def train_models(save_path, hyperpars, overwrite_train, train_on_all_data,
                 remove_overlap_chunks, train_all_previous, target_quantile,
                 train_last_six_complete=False):
    """Fit one model per validation fold, plus an optional extra model.

    The preprocessed training data and the fold definition are obtained
    from ``preprocess``; ``fit_model`` is then called once per model index.

    Args:
        save_path: Forwarded unchanged to ``fit_model``.
        hyperpars: Forwarded unchanged to ``fit_model``.
        overwrite_train: Forwarded unchanged to ``fit_model``.
        train_on_all_data: Truthy to fit one additional model after the
            per-fold models (adds ``int(train_on_all_data)`` to the count).
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls and to
            ``fit_model``.
        train_last_six_complete: Forwarded to both ``preprocess`` calls.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile,
        train_last_six_complete=train_last_six_complete,
    )
    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
        train_last_six_complete=train_last_six_complete,
    )

    num_folds = len(train_val_split)
    # Model indices >= num_folds only exist when train_on_all_data is set.
    num_train_models = num_folds + int(train_on_all_data)

    for model_number in range(1, num_train_models + 1):
        print('\nProcessing fold {} of {}'.format(
            model_number, num_train_models))
        fit_model(save_path, model_number - 1, num_folds, train_val_split,
                  hyperpars, overwrite_train, features, other_features,
                  targets, target_quantile)
def validate_models(save_path, splits, remove_overlap_chunks):
    """Compute out-of-fold MAE figures for every requested split.

    Args:
        save_path: Forwarded unchanged to ``get_fold_mae``.
        splits: Sequence of split identifiers to evaluate.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.

    Returns:
        A list with one ``(split_maes, split)`` tuple per entry of
        ``splits``. ``split_maes`` is a list whose first element is the
        split-level MAE (sum of ``fold_mae * oof_count`` divided by the
        number of feature rows) followed by one ``(fold_mae, oof_count)``
        tuple per fold.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)
    train_val_split = preprocess.train_val_split(remove_overlap_chunks)
    # The fold count is taken from the first split's definition.
    num_folds = len(train_val_split[0])
    num_splits_label = None  # placeholder removed below; kept for clarity
    del num_splits_label

    results = []
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))

        per_fold = []
        for fold in range(num_folds):
            print('Processing fold {} of {}'.format(fold + 1, num_folds))
            fold_mae, oof_count = get_fold_mae(
                save_path, split, fold, num_folds, train_val_split,
                features, targets)
            per_fold.append((fold_mae, oof_count))

        # Weight each fold's MAE by its out-of-fold row count.
        weighted = [fold_mae * oof_count for fold_mae, oof_count in per_fold]
        split_mae = np.array(weighted).sum() / features.shape[0]
        print('Split OOF MAE: {}'.format(np.round(split_mae, 3)))

        results.append(([split_mae] + per_fold, splits[split_id]))

    return results
def test_model(save_path, split, test_on_all_data):
    """Average saved LightGBM models' test predictions into a submission.

    One booster is loaded per model index from
    ``'{save_path}-{split}-{fold_description}.txt'``; the row-wise mean of
    all model predictions is written to a timestamped CSV.

    Args:
        save_path: Prefix of the saved model files.
        split: Split identifier; selects the fold definition and is part
            of each model file name.
        test_on_all_data: Truthy to also use one extra model index beyond
            the per-fold models.
    """
    x_test, other_test, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=NORMALIZE_FEATURES)
    train_val_split = preprocess.train_val_split(remove_overlap_chunks=True)

    num_folds = len(train_val_split[split])
    num_prediction_models = num_folds + int(test_on_all_data)
    num_test = x_test.shape[0]

    # One column of predictions per model.
    model_preds = np.zeros((num_test, num_prediction_models))
    for fold in range(num_prediction_models):
        print("Making test predictions {} of {}".format(
            fold + 1, num_prediction_models))
        model_path = '{}-{}-{}.txt'.format(
            save_path, split, get_fold_description(fold, num_folds))
        booster = lgb.Booster(model_file=model_path)
        model_preds[:, fold] = booster.predict(x_test)

    # Write the output pandas data frame
    preds_test = np.mean(model_preds, 1)
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = preds_test

    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)
def validate_models(save_path, hyperpars, remove_overlap_chunks,
                    train_all_previous, target_quantile):
    """Compute count-weighted out-of-fold MAE over all validation folds.

    Args:
        save_path: Forwarded unchanged to ``get_fold_mae``.
        hyperpars: Mapping forwarded to ``get_fold_mae``; its
            ``'validation_valid_batch'`` entry is passed as the last
            argument.
        remove_overlap_chunks: Forwarded to ``preprocess.train_val_split``.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to ``preprocess.train_val_split`` and
            ``get_fold_mae``.

    Returns:
        ``(maes, av_mae_norm)`` where ``maes`` is a list starting with the
        overall weighted MAE followed by one ``(fold_mae, oof_count)``
        tuple per fold, and ``av_mae_norm`` is the count-weighted average
        of the per-fold normalized MAE.
    """
    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
    )
    num_folds = len(train_val_split)

    fold_results = []  # (fold_mae, oof_count) per fold
    norm_maes = []
    total_count = 0
    for fold in range(num_folds):
        print('Processing fold {} of {}'.format(fold+1, num_folds))
        fold_mae, fold_mae_norm, oof_count = get_fold_mae(
            save_path, fold, num_folds, train_val_split, hyperpars,
            target_quantile, hyperpars['validation_valid_batch'])
        fold_results.append((fold_mae, oof_count))
        norm_maes.append(fold_mae_norm)
        total_count += oof_count

    # Both averages weight each fold by its out-of-fold sample count.
    counts = [count for (_, count) in fold_results]
    weighted_norms = [norm*count for (norm, count) in zip(norm_maes, counts)]
    weighted_maes = [fold_mae*count for (fold_mae, count) in fold_results]
    av_mae_norm = np.array(weighted_norms).sum()/total_count
    mae = np.array(weighted_maes).sum()/total_count

    print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
    print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
    return [mae] + fold_results, av_mae_norm
def test_model(save_path, split, model_on_all_data):
    """Average per-model test predictions and write a submission CSV.

    The test features are reshaped with ``utils.reshape_time_dim`` and
    passed to ``make_predictions`` once per model index; the row-wise
    mean of those predictions is written to a timestamped CSV.

    Args:
        save_path: Forwarded unchanged to ``make_predictions``.
        split: Split identifier; selects the fold definition and is
            forwarded to ``make_predictions``.
        model_on_all_data: Truthy to also use one extra model index
            beyond the per-fold models.
    """
    x_test, other_test, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=True)
    num_test = x_test.shape[0]

    # Only the reshaped features are kept; the zero array and the row
    # index fill the helper's other two arguments.
    x_test_reshaped, _ = utils.reshape_time_dim(
        x_test, np.zeros_like(x_test), np.arange(num_test))

    train_val_split = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(train_val_split[split])
    num_prediction_models = num_folds + int(model_on_all_data)

    # One column of predictions per model.
    model_preds = np.zeros((num_test, num_prediction_models))
    for fold in range(num_prediction_models):
        print("Making test predictions {} of {}".format(
            fold + 1, num_prediction_models))
        model_preds[:, fold] = make_predictions(
            save_path, split, fold, num_folds, x_test_reshaped)

    # Write the output pandas data frame
    preds_test = np.mean(model_preds, 1)
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = preds_test

    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)
def test_model(save_path, test_on_all_folds, train_all_previous,
               target_quantile, median_test_cyle_length, seed_ext=None,
               train_last_six_complete=False, drop_first_test_fold=False):
    """Predict the test set with saved LightGBM models and write a CSV.

    Boosters are loaded from ``'{save_path}-{fold_description}.txt'``.
    Their predictions are averaged row-wise, optionally converted back
    from quantile space, and written to a timestamped submission file.

    Args:
        save_path: Prefix of the saved model files.
        test_on_all_folds: Truthy to use every fold's model; otherwise
            only the last fold's model is used.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls. When
            truthy, predictions are mapped to
            ``median_test_cyle_length * (1 - prediction)``.
        median_test_cyle_length: Scale used in the quantile conversion.
        seed_ext: Optional string appended to the timestamp in the output
            file name.
        train_last_six_complete: When truthy the fold count is forced
            to 1.
        drop_first_test_fold: Truthy to discard the first prediction
            column before averaging.
    """
    x_test, other_test, _ = preprocess.get_preprocessed(
        'test',
        remove_overlap_chunks=True,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile,
    )
    train_val_split = preprocess.train_val_split(
        ordered=True,
        remove_overlap_chunks=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
    )

    num_folds = len(train_val_split)
    if train_last_six_complete:
        num_folds = 1

    if test_on_all_folds:
        pred_folds = list(range(num_folds))
    else:
        pred_folds = [num_folds - 1]
    num_pred_folds = len(pred_folds)

    # Columns follow the order of pred_folds, not the fold ids themselves.
    model_preds = np.zeros((x_test.shape[0], num_pred_folds))
    for column, fold in enumerate(pred_folds):
        print("Making test predictions {} of {}".format(
            column + 1, num_pred_folds))
        model_path = '{}-{}.txt'.format(
            save_path, get_fold_description(fold, num_folds))
        booster = lgb.Booster(model_file=model_path)
        model_preds[:, column] = booster.predict(x_test)

    if drop_first_test_fold:
        model_preds = model_preds[:, 1:]
    preds_test = np.mean(model_preds, 1)
    if target_quantile:
        preds_test = median_test_cyle_length * (1 - preds_test)

    # Write the output pandas data frame
    submission = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')
    submission.time_to_failure = preds_test

    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    if seed_ext is not None:
        the_date = the_date + seed_ext
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv',
        index=False)
def train_models(custom_model, save_path, hyperpars, overwrite_train,
                 train_on_all_data, remove_overlap_chunks,
                 train_all_previous, skip_last_train_fold, target_quantile,
                 train_last_six_complete=False):
    """Fit one model per fold, clearing the Keras session before each fit.

    Args:
        custom_model: Forwarded unchanged to ``fit_model``.
        save_path: Forwarded unchanged to ``fit_model``.
        hyperpars: Forwarded unchanged to ``fit_model``.
        overwrite_train: Forwarded unchanged to ``fit_model``.
        train_on_all_data: Truthy to fit one additional model after the
            per-fold models.
        remove_overlap_chunks: Forwarded to ``preprocess.train_val_split``.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        skip_last_train_fold: Forwarded unchanged to ``fit_model``.
        target_quantile: Forwarded to ``preprocess.train_val_split`` and
            ``fit_model``.
        train_last_six_complete: When truthy the fold count is forced
            to 1; also forwarded to ``fit_model``.
    """
    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
    )

    if train_last_six_complete:
        num_folds = 1
    else:
        num_folds = len(train_val_split)
    num_train_models = num_folds + int(train_on_all_data)

    for model_number in range(1, num_train_models + 1):
        print('\nProcessing fold {} of {}'.format(
            model_number, num_train_models))
        # Start every fit from a fresh Keras backend session.
        K.clear_session()
        fit_model(custom_model, save_path, model_number - 1, num_folds,
                  skip_last_train_fold, train_val_split, hyperpars,
                  overwrite_train, target_quantile, train_last_six_complete)
def test_model(save_path, test_on_all_folds, train_all_previous,
               target_quantile, median_test_cyle_length, seed_ext=None,
               train_last_six_complete=False, drop_first_test_fold=False):
    """Predict the test files with saved RNN models and write a CSV.

    Test features are built from ``TEST_DATA`` with
    ``utils.get_rnn_prediction_features``. For each selected fold,
    ``make_predictions`` is called and its output is averaged per test
    file. The per-file median over folds is written to a timestamped
    submission file.

    NOTE(review): ``hyperpars``, ``TEST_DATA`` and ``data_folder`` are not
    parameters; they are assumed to be module-level names -- confirm.

    Args:
        save_path: Forwarded unchanged to ``make_predictions``.
        test_on_all_folds: Truthy to use every fold's model; otherwise
            only the last fold's model is used.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to ``preprocess.train_val_split``. When
            truthy, predictions are mapped to
            ``median_test_cyle_length * (1 - prediction)``.
        median_test_cyle_length: Scale used in the quantile conversion.
        seed_ext: Optional string appended to the timestamp in the output
            file name.
        train_last_six_complete: When truthy the fold count is forced
            to 1.
        drop_first_test_fold: Truthy to discard the first prediction
            column before taking the median.
    """
    train_val_split = preprocess.train_val_split(
        ordered=True, remove_overlap_chunks=True, num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    # Each test file is treated as 150000 raw steps grouped into blocks.
    test_file_steps = int(150000/hyperpars['block_steps'])
    num_test_files = int(TEST_DATA.shape[0]/test_file_steps)
    file_starts = test_file_steps*np.arange(num_test_files)
    test_ranges = (
        file_starts,
        test_file_steps*(1+np.arange(num_test_files))-(
            hyperpars['chunk_blocks']))
    (x_test_batched, test_start_rows) = utils.get_rnn_prediction_features(
        TEST_DATA, test_ranges, hyperpars, order_start_rows=True)
    # Every batch entry is repeated 20 times along axis 0; the repeats are
    # averaged back per test file below.
    x_test_batched = np.repeat(x_test_batched, 20, 0)

    num_folds = len(train_val_split)
    num_folds = 1 if train_last_six_complete else num_folds
    pred_folds = list(range(num_folds)) if test_on_all_folds else [
        num_folds-1]

    model_preds = np.zeros((num_test_files, len(pred_folds)))
    for (i, fold) in enumerate(pred_folds):
        print('Making test predictions {} of {}'.format(i+1, len(pred_folds)))
        # K.clear_session() # DO NOT UNCOMMENT
        fold_test_preds = make_predictions(save_path, hyperpars, fold,
                                           num_folds, x_test_batched)
        # Index by position in pred_folds, not by fold id: with
        # test_on_all_folds false the array has a single column while
        # fold == num_folds - 1, which used to raise IndexError.
        model_preds[:, i] = np.mean(
            fold_test_preds.reshape(num_test_files, -1), 1)

    model_preds = model_preds[:, 1:] if drop_first_test_fold else model_preds
    preds_test = np.median(model_preds, 1)
    if target_quantile:
        preds_test = median_test_cyle_length*(1-preds_test)

    # Write the output pandas data frame
    submission = pd.read_csv(data_folder + 'sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    the_date = the_date if seed_ext is None else the_date + seed_ext
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)
def train_models(save_path, splits, hyperpars, overwrite_train,
                 early_stopping, train_on_all_data, remove_overlap_chunks):
    """Fit one model per (split, fold) pair, plus optional extra models.

    Args:
        save_path: Forwarded unchanged to ``fit_model``.
        splits: Sequence of split identifiers to train.
        hyperpars: Forwarded unchanged to ``fit_model``.
        overwrite_train: Forwarded unchanged to ``fit_model``.
        early_stopping: Forwarded unchanged to ``fit_model``.
        train_on_all_data: Truthy to fit one additional model per split
            after the per-fold models.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)
    train_val_split = preprocess.train_val_split(remove_overlap_chunks)

    # The fold count is taken from the first split's definition.
    num_folds = len(train_val_split[0])
    num_train_models = num_folds + int(train_on_all_data)
    num_splits = len(splits)

    for split_number, split in enumerate(splits, start=1):
        print('Processing split {} of {}'.format(split_number, num_splits))
        for fold in range(num_train_models):
            print('\nProcessing fold {} of {}'.format(
                fold + 1, num_train_models))
            fit_model(save_path, split, fold, num_folds, train_val_split,
                      hyperpars, overwrite_train, features, other_features,
                      targets, early_stopping)
def validate_models(save_path, remove_overlap_chunks, train_all_previous,
                    target_quantile, train_last_six_complete=False):
    """Compute count-weighted out-of-fold MAE over all validation folds.

    Args:
        save_path: Forwarded unchanged to ``get_fold_mae``.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
        train_all_previous: Forwarded to ``preprocess.train_val_split``.
        target_quantile: Forwarded to both ``preprocess`` calls and to
            ``get_fold_mae``. When truthy, the targets are replaced by
            ``other_features.target_original.values``.
        train_last_six_complete: Accepted but not used in this function.

    Returns:
        ``(maes, av_mae_norm)`` where ``maes`` is a list starting with the
        overall weighted MAE followed by one ``(fold_mae, oof_count)``
        tuple per fold, and ``av_mae_norm`` is the count-weighted average
        of the per-fold normalized MAE.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train',
        remove_overlap_chunks,
        scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile,
    )
    train_val_split = preprocess.train_val_split(
        remove_overlap_chunks,
        ordered=True,
        num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile,
    )
    if target_quantile:
        # Score against the original target column rather than the
        # preprocessed targets.
        targets = other_features.target_original.values

    num_folds = len(train_val_split)
    fold_results = []  # (fold_mae, oof_count) per fold
    norm_maes = []
    total_count = 0
    for fold in range(num_folds):
        print('\nProcessing fold {} of {}'.format(fold + 1, num_folds))
        fold_mae, fold_mae_norm, oof_count = get_fold_mae(
            save_path, fold, num_folds, train_val_split, features, targets,
            target_quantile)
        fold_results.append((fold_mae, oof_count))
        norm_maes.append(fold_mae_norm)
        total_count += oof_count

    # Both averages weight each fold by its out-of-fold sample count.
    counts = [count for (_, count) in fold_results]
    weighted_norms = [
        norm * count for (norm, count) in zip(norm_maes, counts)]
    weighted_maes = [fold_mae * count for (fold_mae, count) in fold_results]
    av_mae_norm = np.array(weighted_norms).sum() / total_count
    mae = np.array(weighted_maes).sum() / total_count

    print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
    print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
    return [mae] + fold_results, av_mae_norm
def train_models(custom_model, save_path, splits, hyperpars,
                 overwrite_train, model_on_all_data, remove_overlap_chunks):
    """Fit one model per (split, fold) pair with a fresh Keras session.

    Args:
        custom_model: Forwarded unchanged to ``fit_model``.
        save_path: Forwarded unchanged to ``fit_model``.
        splits: Sequence of split identifiers to train.
        hyperpars: Forwarded unchanged to ``fit_model``.
        overwrite_train: Forwarded unchanged to ``fit_model``.
        model_on_all_data: Truthy to fit one additional model per split
            after the per-fold models.
        remove_overlap_chunks: Forwarded to both ``preprocess`` calls.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=True)
    train_val_split = preprocess.train_val_split(remove_overlap_chunks)

    # The fold count is taken from the first split's definition.
    num_folds = len(train_val_split[0])
    num_train_models = num_folds + int(model_on_all_data)
    num_splits = len(splits)

    for split_number, split in enumerate(splits, start=1):
        print('Processing split {} of {}'.format(split_number, num_splits))
        for fold in range(num_train_models):
            print('\nProcessing fold {} of {}'.format(
                fold + 1, num_train_models))
            # Start every fit from a fresh Keras backend session.
            K.clear_session()
            fit_model(custom_model, save_path, split, fold, num_folds,
                      train_val_split, hyperpars, overwrite_train, features,
                      other_features, targets)
#################################################################################################### if is_train: for seed in SEED: # This use the context manager to operate in the data directory with cd(Name+f'-{seed}'): pickle.dump(sym_params, open("sym_params.sav", "wb")) logfile = open('log.txt','w+') resultfile = open('result.txt','w+') if os.path.exists('test.sav'): logfile.write('Did not calculate symfunctions.\n') else: data_dict = snn2sav(db, Name, elements, params_set, element_energy=element_energy) train_dict = train_test_split(data_dict,1-test_percent,seed=seed) train_val_split(train_dict,1-val_percent,seed=seed) logfile.flush() train_dict = torch.load('final_train.sav') val_dict = torch.load('final_val.sav') test_dict = torch.load('test.sav') scaling = get_scaling(train_dict, fp_scale_method, e_scale_method) n_nodes = hp['n_nodes'] activations = hp['activations'] lr = hp['lr'] model = MultiLayerNet(N_sym, n_nodes, activations, nelem, scaling=scaling) if opt_method == 'lbfgs':
def main():
    """Build the train/validation and train/test sets described by ``cfg``.

    Reads the target and covariate HDF files, restricts both to the dates
    found at position 2 of the target index entries, then calls
    ``preprocess.train_val_split`` for every validation year and
    ``preprocess.train_test_split`` for every test year.
    """
    idx = pd.IndexSlice

    target = pd.read_hdf(cfg.data_target_file)
    data = pd.read_hdf(cfg.data_cov_file)

    train_start_date = cfg.train_start_date
    end_date = cfg.end_date
    # The daily range is not used afterwards; it is still constructed so
    # the call is made on the configured dates exactly as before.
    time_index = pd.date_range(train_start_date, end_date, freq='1D')

    # Keep only the date part (text before the first space) of each
    # index entry's third element.
    existing_dates = []
    for index_entry in target.index:
        existing_dates.append(str(index_entry[2]).split(" ")[0])
    unique_dates = list(set(existing_dates))

    target = target.loc[idx[:, :, unique_dates], :]
    data = data.loc[idx[unique_dates], :]

    # Read all settings up front so a missing one fails before any split
    # is generated.
    cv_path = cfg.rootpath_cv
    forecast_path = cfg.forecast_rootpath
    target_var = cfg.target_var
    val_years = cfg.val_years
    test_years = cfg.test_years
    val_train_range = cfg.val_train_range
    test_train_range = cfg.test_train_range
    past_years = cfg.past_kyears
    val_range = cfg.val_range
    val_freq = cfg.val_freq
    test_start_date = cfg.test_start_date
    test_time_index_all = pd.date_range(test_start_date, end_date, freq='7D')

    # to create train-validation sets
    for year in val_years:
        for num_forecast in range(1, 2):
            preprocess.train_val_split(
                cv_path, data, target, target_var, year, num_forecast,
                train_range=val_train_range,
                past_years=past_years,
                test_range=val_range,
                test_freq=val_freq,
                n_jobs=20,
            )

    # to create train-test sets
    for year in test_years:
        for num_forecast in range(1, 2):
            preprocess.train_test_split(
                forecast_path, data, target, target_var,
                test_time_index_all, year, num_forecast,
                train_range=test_train_range,
                past_years=past_years,
                n_jobs=20,
            )