def test_model(save_path, split, test_on_all_data):
    """Predict the test set with the saved LightGBM models and write a
    submission csv.

    One booster is loaded per fold of `split` from
    ``'<save_path>-<split>-<fold description>.txt'``. When `test_on_all_data`
    is truthy, one additional model index (``num_folds``) is loaded as well.
    The per-model predictions are averaged and stored in the
    ``time_to_failure`` column of the sample submission, which is then saved
    under a timestamped file name.

    Args:
        save_path: Path prefix of the saved model files.
        split: Index into the train/validation split list; also part of the
            model file name.
        test_on_all_data: Whether to include the extra model beyond the folds.
    """
    x_test, _, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=NORMALIZE_FEATURES)
    fold_splits = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(fold_splits[split])
    model_count = num_folds + int(test_on_all_data)

    # One column of predictions per model.
    predictions = np.zeros((x_test.shape[0], model_count))
    for model_id in range(model_count):
        print("Making test predictions {} of {}".format(
            model_id + 1, model_count))
        description = get_fold_description(model_id, num_folds)
        booster = lgb.Booster(
            model_file='{}-{}-{}.txt'.format(save_path, split, description))
        predictions[:, model_id] = booster.predict(x_test)

    # Write the output pandas data frame
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = predictions.mean(axis=1)
    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)
def train_models(save_path, hyperpars, overwrite_train, train_on_all_data,
                 remove_overlap_chunks, train_all_previous, target_quantile,
                 train_last_six_complete=False):
    """Fit one model per ordered fold, plus an optional extra model.

    The preprocessed training data and the ordered train/validation split are
    loaded once; `fit_model` is then called for every fold index in
    ``range(num_folds + int(train_on_all_data))``.

    Args:
        save_path: Forwarded to `fit_model`.
        hyperpars: Forwarded to `fit_model`.
        overwrite_train: Forwarded to `fit_model`.
        train_on_all_data: When truthy, one extra fold index (``num_folds``)
            is fitted after the regular folds.
        remove_overlap_chunks: Forwarded to the preprocessing helpers.
        train_all_previous: Forwarded to `preprocess.train_val_split`.
        target_quantile: Forwarded to the preprocessing helpers and to
            `fit_model`.
        train_last_six_complete: Forwarded to the preprocessing helpers.
    """
    # Options that both preprocessing calls receive unchanged.
    shared_options = {
        'remove_incomplete_eqs': REMOVE_INCOMPLETE_EQS,
        'target_quantile': target_quantile,
        'train_last_six_complete': train_last_six_complete,
    }
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES,
        **shared_options)
    fold_splits = preprocess.train_val_split(
        remove_overlap_chunks, ordered=True, num_folds=NUM_FOLDS,
        train_all_previous=train_all_previous, **shared_options)

    num_folds = len(fold_splits)
    model_count = num_folds + int(train_on_all_data)
    for fold in range(model_count):
        print('\nProcessing fold {} of {}'.format(fold + 1, model_count))
        fit_model(save_path, fold, num_folds, fold_splits, hyperpars,
                  overwrite_train, features, other_features, targets,
                  target_quantile)
def validate_models(save_path, splits, remove_overlap_chunks):
    """Compute out-of-fold MAE summaries for each requested split.

    For every split, `get_fold_mae` is evaluated on each fold. The split MAE
    is the count-weighted sum of the fold MAEs divided by the number of rows
    in the training features.

    Args:
        save_path: Forwarded to `get_fold_mae`.
        splits: Sequence of split identifiers to evaluate.
        remove_overlap_chunks: Forwarded to the preprocessing helpers.

    Returns:
        A list with one ``(split_maes, split)`` tuple per entry of `splits`,
        where ``split_maes`` is
        ``[split_mae, (fold_mae, oof_count), (fold_mae, oof_count), ...]``.
    """
    features, _, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)
    fold_splits = preprocess.train_val_split(remove_overlap_chunks)
    num_folds = len(fold_splits[0])
    num_splits = len(splits)

    results = []
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, num_splits))

        fold_results = []
        for fold in range(num_folds):
            print('Processing fold {} of {}'.format(fold + 1, num_folds))
            fold_mae, oof_count = get_fold_mae(
                save_path, split, fold, num_folds, fold_splits, features,
                targets)
            fold_results.append((fold_mae, oof_count))

        # Weight each fold MAE by its out-of-fold count before averaging.
        weighted = np.array([mae * count for (mae, count) in fold_results])
        split_mae = weighted.sum() / features.shape[0]
        print('Split OOF MAE: {}'.format(np.round(split_mae, 3)))
        results.append(([split_mae] + fold_results, splits[split_id]))

    return results
def test_model(save_path, split, model_on_all_data):
    """Predict the test set with every saved model of `split` and write a
    submission csv.

    The scaled test features are passed through `utils.reshape_time_dim`
    before `make_predictions` is called once per model index. The per-model
    predictions are averaged and written to the ``time_to_failure`` column of
    the sample submission, saved under a timestamped file name.

    Args:
        save_path: Forwarded to `make_predictions`.
        split: Index into the train/validation split list; forwarded to
            `make_predictions`.
        model_on_all_data: When truthy, one extra model index (``num_folds``)
            is predicted in addition to the folds.
    """
    x_test, _, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=True)
    # The reshape helper also takes a target array and row ids; zeros and a
    # plain range are supplied here and the second return value is discarded.
    placeholder_targets = np.zeros_like(x_test)
    row_ids = np.arange(x_test.shape[0])
    x_test_reshaped, _ = utils.reshape_time_dim(
        x_test, placeholder_targets, row_ids)

    fold_splits = preprocess.train_val_split(remove_overlap_chunks=True)
    num_folds = len(fold_splits[split])
    model_count = num_folds + int(model_on_all_data)

    predictions = np.zeros((x_test.shape[0], model_count))
    for model_id in range(model_count):
        print("Making test predictions {} of {}".format(
            model_id + 1, model_count))
        predictions[:, model_id] = make_predictions(
            save_path, split, model_id, num_folds, x_test_reshaped)

    # Write the output pandas data frame
    submission = pd.read_csv(
        '/home/tom/Kaggle/LANL/Data/sample_submission.csv')
    submission.time_to_failure = predictions.mean(axis=1)
    timestamp = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    submission.to_csv(
        '/home/tom/Kaggle/LANL/Submissions/' + timestamp + '.csv',
        index=False)
def valid_order(model_path, split, data_folder):
    """Compute pairwise order predictions on a validation subset and save
    them as a ``.npy`` file.

    The validation subset consists of the rows of ``TRAIN_DATA`` whose
    ``notrain_eq_id`` lies in the eleven ids starting at the earthquake id of
    the first validation range of `split`. Predictions are accumulated in an
    array of shape ``(num_train_models, num_valid_files, num_valid_files)``
    and saved to ``data_folder + 'valid_order_probs.npy'``.

    NOTE(review): ``hyperpars``, ``TRAIN_DATA`` and ``ENCODER_PATH`` are not
    parameters of this function; they are expected to be defined at module
    level (not visible in this chunk) - confirm.

    Args:
        model_path: Path prefix of the saved ``.h5`` fold models.
        split: Index into the gap train/validation split list; also part of
            the model file name.
        data_folder: Prefix (including any trailing separator) of the output
            file path.
    """
    train_val_split = preprocess.train_val_split_gaps()
    num_folds = len(train_val_split[0])
    # Only a single fold model is evaluated.
    num_train_models = 1

    # Restrict the validation data to the eleven earthquake ids starting at
    # the first validation earthquake
    first_val_ranges = train_val_split[split][0][1]  # First fold, validation
    other_train_features = preprocess.get_preprocessed(
        'train', remove_overlap_chunks=True)[1]
    # Earthquake id of the feature row whose start_row equals the first row
    # of the first validation range.
    first_eq_id = other_train_features.eq_id.values[np.where(
        other_train_features.start_row.values == first_val_ranges[0][0])[0][0]]
    eq_ids = TRAIN_DATA.notrain_eq_id.values
    valid_rows = np.where(np.logical_and(eq_ids >= first_eq_id,
                                         eq_ids < (first_eq_id + 11)))[0]
    VALID_DATA = TRAIN_DATA.iloc[valid_rows]
    # Number of validation "files": rows divided by the number of blocks per
    # 150000 steps (presumably the length of one test file - TODO confirm).
    num_valid_files = int(VALID_DATA.shape[0]/(150000/hyperpars['block_steps']))
    order_probs = np.zeros((num_train_models, num_valid_files, num_valid_files))
    for fold in range(num_train_models):
        print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
        K.clear_session()
        encoder_model = load_model(ENCODER_PATH, custom_objects={
            'Attention': models.Attention})
        # Number of rows of order_probs filled per iteration.
        comp_rows_per_it = 4
        fold_description = get_fold_description(fold, num_folds)
        fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
        model = load_model(fold_model_path, custom_objects={
            'Attention': models.Attention,
            'GradientReversal': models.GradientReversal,
            'tf': tf,
        })
        # Trailing files that do not fill a complete group of
        # comp_rows_per_it rows are left at zero.
        num_iterations = int(num_valid_files/comp_rows_per_it)
        for i in range(num_iterations):
            gc.collect()
            print('\nIteration {} of {}'.format(i+1, num_iterations))
            first_test_id = int(comp_rows_per_it*i)
            test_gen = utils.generator_cpc_main_batch_test(
                VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)

            # Generate the test data by calling the generator *N* times.
            # The division is evaluated before the multiplication, so with
            # comp_rows_per_it == 4 this equals num_valid_files.
            N = int(num_valid_files/4*comp_rows_per_it)
            test_data = list(itertools.islice(test_gen, N))
            # split/fold/num_folds are passed as -1 together with an already
            # loaded model.
            test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                          data=test_data, model=model)
            # Use the fourth prediction output: average the first four
            # entries of its last axis, then average groups of four.
            # NOTE(review): the meaning of test_preds[3] and of its axes is
            # not visible here - confirm against make_predictions.
            order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
                [test_preds[3].shape[0], -1, 4]).mean(-1)
            order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
                order_preds.reshape([comp_rows_per_it, -1]))

    save_path = data_folder + 'valid_order_probs.npy'
    np.save(save_path, order_probs)
    # np.load(save_path)
def validate_save_gap_preds(model_path, save_path, split, hyperpars):
    """Predict gaps on the first eleven validation earthquakes and save the
    aligned predictions as csv.

    Rows of the module-level ``GAP_DATA`` frame (not visible in this chunk)
    are selected from the first row of the first validation range of `split`
    up to the start of the earthquake eleven ids later. They are trimmed,
    cut into consecutive files of 150000 rows, predicted with
    `make_predictions` and aligned with `utils.align_test_gap_preds`. The
    result is written to ``save_path + '_aligned_predictions_valid.csv'``.

    Args:
        model_path: Forwarded to `make_predictions`.
        save_path: Prefix of the output csv path.
        split: Index into the gap train/validation split list; forwarded to
            `make_predictions`.
        hyperpars: Mapping that must contain ``'block_steps'``; also
            forwarded to the `utils` helpers.
    """
    # Determine the first eleven validation earthquake ids
    num_first_eqs = 11
    train_val_split = preprocess.train_val_split_gaps()
    first_val_ranges = train_val_split[split][0][1]  # First fold, validation
    other_train_features = preprocess.get_preprocessed(
        'train', remove_overlap_chunks=True)[1]
    # Position in other_train_features of the row whose start_row equals the
    # first row of the first validation range.
    first_eq_id_other_features = np.where(
        other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
    first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
    # start_row of the first feature row belonging to the earthquake
    # num_first_eqs ids after the first validation earthquake.
    first_row_next_eq = other_train_features.start_row.values[np.where(
        other_train_features.eq_id.values == first_eq_id + num_first_eqs)[0][0]]
    first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)

    # Drop the last part of the valid_eq_ids that don't contain an entire chunk
    valid_file_steps = 150000
    # Positions in other_train_features where the earthquake id increases.
    new_eq_ids = np.where(
        np.diff(other_train_features.eq_id.values) > 0)[0] + 1
    drop_eq_end_ids = new_eq_ids[
        new_eq_ids > first_eq_id_other_features][:num_first_eqs]
    drop_ids = np.array([])
    for i in range(num_first_eqs):
        # Rows from valid_file_steps past the start_row two feature rows
        # before the earthquake change, up to the first row of the next
        # earthquake.
        # NOTE(review): the "- 2" offset presumably skips a trailing partial
        # chunk - confirm against the preprocessing code.
        drop_ids_eq = np.arange(
            other_train_features.start_row.values[drop_eq_end_ids[i] - 2] + valid_file_steps,
            other_train_features.start_row.values[drop_eq_end_ids[i]])
        drop_ids = np.append(drop_ids, drop_ids_eq)
    first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids, assume_unique=True)

    # Same logic as in test to generate the gap predicted probabilities
    x_valid = GAP_DATA.iloc[first_valid_eq_ids]
    # x_valid = x_valid[:600000]
    num_valid_files = int(x_valid.shape[0] / valid_file_steps)
    # Keep only a whole number of files.
    x_valid = x_valid.iloc[np.arange(valid_file_steps * num_valid_files)]
    # Per-file (first row, last row) pairs; the last row is the file end
    # minus hyperpars['block_steps'].
    valid_ranges = (valid_file_steps * np.arange(num_valid_files),
                    valid_file_steps * (1 + np.arange(num_valid_files)) - (hyperpars['block_steps']))
    (x_valid_batched, _, valid_start_rows) = utils.get_gap_prediction_features(
        x_valid, valid_ranges, hyperpars, order_start_rows=True)
    file_names = ['valid_' + str(i + 1) for i in range(num_valid_files)]
    valid_preds = make_predictions(model_path, split, fold=0, num_folds=1, x_features=x_valid_batched)
    valid_gap_preds_aligned = utils.align_test_gap_preds(
        valid_preds, valid_file_steps, valid_start_rows, hyperpars, file_names)
    data_path = save_path + '_aligned_predictions_valid' + '.csv'
    valid_gap_preds_aligned.to_csv(data_path, index=False)
def test_model(save_path, test_on_all_folds, train_all_previous,
               target_quantile, median_test_cyle_length, seed_ext=None,
               train_last_six_complete=False, drop_first_test_fold=False):
    """Predict the test set with the saved LightGBM fold models and write a
    submission csv.

    One booster per selected fold is loaded from
    ``'<save_path>-<fold description>.txt'``. The per-fold predictions are
    averaged, optionally mapped back from quantile targets, and written to
    the ``time_to_failure`` column of the sample submission.

    Args:
        save_path: Path prefix of the saved model files.
        test_on_all_folds: When truthy, predict with every fold; otherwise
            only with the last fold.
        train_all_previous: Forwarded to `preprocess.train_val_split`.
        target_quantile: Forwarded to the preprocessing helpers. When truthy
            the averaged prediction ``p`` is converted to
            ``median_test_cyle_length * (1 - p)``.
        median_test_cyle_length: Scale used for the quantile conversion.
        seed_ext: Optional string appended to the timestamp in the
            submission file name.
        train_last_six_complete: When truthy, the number of folds is forced
            to 1.
        drop_first_test_fold: When truthy, the first predicted fold is
            excluded from the average.

    Raises:
        ValueError: If `drop_first_test_fold` would leave no fold to average.
            Previously this silently produced an all-NaN submission.
    """
    x_test, _, _ = preprocess.get_preprocessed(
        'test', remove_overlap_chunks=True, scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)
    train_val_split = preprocess.train_val_split(
        ordered=True, remove_overlap_chunks=True, num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)

    num_folds = 1 if train_last_six_complete else len(train_val_split)
    if test_on_all_folds:
        pred_folds = list(range(num_folds))
    else:
        pred_folds = [num_folds - 1]

    # Fail before loading any model: averaging zero columns yields NaN.
    num_averaged = len(pred_folds) - (1 if drop_first_test_fold else 0)
    if num_averaged < 1:
        raise ValueError(
            'No fold predictions left to average: {} fold(s) selected with '
            'drop_first_test_fold={}'.format(
                len(pred_folds), drop_first_test_fold))

    model_preds = np.zeros((x_test.shape[0], len(pred_folds)))
    for (i, fold) in enumerate(pred_folds):
        print("Making test predictions {} of {}".format(
            i + 1, len(pred_folds)))
        fold_description = get_fold_description(fold, num_folds)
        model_path = '{}-{}.txt'.format(save_path, fold_description)
        model = lgb.Booster(model_file=model_path)
        model_preds[:, i] = model.predict(x_test)

    if drop_first_test_fold:
        model_preds = model_preds[:, 1:]
    preds_test = np.mean(model_preds, 1)
    if target_quantile:
        preds_test = median_test_cyle_length * (1 - preds_test)

    # Write the output pandas data frame
    submission = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')
    submission.time_to_failure = preds_test
    the_date = datetime.datetime.now().strftime('%y-%m-%d-%H-%M')
    the_date = the_date if seed_ext is None else the_date + seed_ext
    submission_path = '/home/tom/Kaggle/LANL/Submissions/' + the_date + '.csv'
    submission.to_csv(submission_path, index=False)
def train_models(save_path, splits, hyperpars, overwrite_train,
                 early_stopping, train_on_all_data, remove_overlap_chunks):
    """Fit one model per fold for every requested split.

    The preprocessed training data and the train/validation split are loaded
    once. For each split, `fit_model` is called for every fold index in
    ``range(num_folds + int(train_on_all_data))``.

    Args:
        save_path: Forwarded to `fit_model`.
        splits: Iterable of split identifiers to train.
        hyperpars: Forwarded to `fit_model`.
        overwrite_train: Forwarded to `fit_model`.
        early_stopping: Forwarded to `fit_model`.
        train_on_all_data: When truthy, one extra fold index (``num_folds``)
            is fitted per split.
        remove_overlap_chunks: Forwarded to the preprocessing helpers.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES)
    fold_splits = preprocess.train_val_split(remove_overlap_chunks)

    # The fold count is taken from the first split.
    num_folds = len(fold_splits[0])
    model_count = num_folds + int(train_on_all_data)
    num_splits = len(splits)

    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, num_splits))
        for fold in range(model_count):
            print('\nProcessing fold {} of {}'.format(fold + 1, model_count))
            fit_model(save_path, split, fold, num_folds, fold_splits,
                      hyperpars, overwrite_train, features, other_features,
                      targets, early_stopping)
def validate_models(save_path, remove_overlap_chunks, train_all_previous,
                    target_quantile, train_last_six_complete=False):
    """Compute count-weighted out-of-fold MAE over the ordered folds.

    Args:
        save_path: Forwarded to `get_fold_mae`.
        remove_overlap_chunks: Forwarded to the preprocessing helpers.
        train_all_previous: Forwarded to `preprocess.train_val_split`.
        target_quantile: Forwarded to the preprocessing helpers and to
            `get_fold_mae`. When truthy, the targets are replaced by
            ``other_features.target_original.values``.
        train_last_six_complete: Accepted but not used in this function.

    Returns:
        ``(maes, av_mae_norm)`` where ``maes`` is
        ``[mae, (fold_mae, oof_count), ...]`` and both ``mae`` and
        ``av_mae_norm`` are averages weighted by the out-of-fold counts.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=NORMALIZE_FEATURES,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        target_quantile=target_quantile)
    fold_splits = preprocess.train_val_split(
        remove_overlap_chunks, ordered=True, num_folds=NUM_FOLDS,
        remove_incomplete_eqs=REMOVE_INCOMPLETE_EQS,
        train_all_previous=train_all_previous,
        target_quantile=target_quantile)
    if target_quantile:
        targets = other_features.target_original.values

    num_folds = len(fold_splits)
    fold_results = []    # (fold_mae, oof_count) per fold
    weighted_maes = []   # fold_mae * oof_count per fold
    weighted_norms = []  # fold_mae_norm * oof_count per fold
    total_count = 0
    for fold in range(num_folds):
        print('\nProcessing fold {} of {}'.format(fold + 1, num_folds))
        fold_mae, fold_mae_norm, oof_count = get_fold_mae(
            save_path, fold, num_folds, fold_splits, features, targets,
            target_quantile)
        fold_results.append((fold_mae, oof_count))
        weighted_maes.append(fold_mae * oof_count)
        weighted_norms.append(fold_mae_norm * oof_count)
        total_count += oof_count

    av_mae_norm = np.array(weighted_norms).sum() / total_count
    mae = np.array(weighted_maes).sum() / total_count

    print('\nAverage OOF MAE: {}'.format(np.round(mae, 3)))
    print('Average OOF MAE normalized: {}'.format(np.round(av_mae_norm, 3)))
    return [mae] + fold_results, av_mae_norm
def train_models(custom_model, save_path, splits, hyperpars, overwrite_train,
                 model_on_all_data, remove_overlap_chunks):
    """Fit `custom_model` once per fold for every requested split.

    The scaled training data and the train/validation split are loaded once.
    Before every call to `fit_model` the Keras backend session is cleared.

    Args:
        custom_model: Forwarded to `fit_model`.
        save_path: Forwarded to `fit_model`.
        splits: Iterable of split identifiers to train.
        hyperpars: Forwarded to `fit_model`.
        overwrite_train: Forwarded to `fit_model`.
        model_on_all_data: When truthy, one extra fold index (``num_folds``)
            is fitted per split.
        remove_overlap_chunks: Forwarded to the preprocessing helpers.
    """
    features, other_features, targets = preprocess.get_preprocessed(
        'train', remove_overlap_chunks, scale=True)
    fold_splits = preprocess.train_val_split(remove_overlap_chunks)

    # The fold count is taken from the first split.
    num_folds = len(fold_splits[0])
    model_count = num_folds + int(model_on_all_data)
    num_splits = len(splits)

    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, num_splits))
        for fold in range(model_count):
            print('\nProcessing fold {} of {}'.format(fold + 1, model_count))
            # Start every fit from a fresh backend session.
            K.clear_session()
            fit_model(custom_model, save_path, split, fold, num_folds,
                      fold_splits, hyperpars, overwrite_train, features,
                      other_features, targets)
def valid_order(model_path, split, data_folder, hyperpars):
    """Compute pairwise order predictions on the first eleven validation
    earthquakes and save them as a ``.npy`` file.

    Rows of the module-level ``TRAIN_AUGMENT`` frame (not visible in this
    chunk) are selected from the first row of the first validation range of
    `split` up to the start of the earthquake eleven ids later, trimmed, and
    cut into consecutive files of 150000 rows. The resulting
    ``(num_valid_files, num_valid_files)`` array is saved to
    ``data_folder + 'valid_order_probs_raw_signal.npy'``.

    Args:
        model_path: Path prefix of the saved ``.h5`` model.
        split: Index into the gap train/validation split list; also part of
            the model file name.
        data_folder: Prefix (including any trailing separator) of the output
            file path.
        hyperpars: Forwarded to `utils.generator_cpc_batch_test`.
    """
    # Determine the first eleven validation earthquake ids
    num_first_eqs = 11
    valid_file_steps = 150000
    # Number of rows of order_probs filled per iteration.
    comp_rows_per_it = 4
    train_val_split = preprocess.train_val_split_gaps()
    num_folds = len(train_val_split[0])
    first_val_ranges = train_val_split[split][0][1]  # First fold, validation
    other_train_features = preprocess.get_preprocessed(
        'train', remove_overlap_chunks=True)[1]
    # Position in other_train_features of the row whose start_row equals the
    # first row of the first validation range.
    first_eq_id_other_features = np.where(
        other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
    first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
    # start_row of the first feature row belonging to the earthquake
    # num_first_eqs ids after the first validation earthquake.
    first_row_next_eq = other_train_features.start_row.values[np.where(
        other_train_features.eq_id.values == first_eq_id+num_first_eqs)[0][0]]
    first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)

    # Drop the last part of the valid_eq_ids that don't contain an entire chunk
    # new_eq_ids: positions in other_train_features where the earthquake id
    # increases.
    new_eq_ids = np.where(np.diff(other_train_features.eq_id.values) > 0)[0] + 1
    drop_eq_end_ids = new_eq_ids[new_eq_ids > first_eq_id_other_features][
        :num_first_eqs]
    drop_ids = np.array([])
    for i in range(num_first_eqs):
        # NOTE(review): the "-2" offset presumably skips a trailing partial
        # chunk - confirm against the preprocessing code.
        drop_ids_eq = np.arange(
            other_train_features.start_row.values[
                drop_eq_end_ids[i]-2]+valid_file_steps,
            other_train_features.start_row.values[drop_eq_end_ids[i]])
        drop_ids = np.append(drop_ids, drop_ids_eq)
    first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids,
                                      assume_unique=True)

    # Same logic as in test to generate the gap predicted probabilities
    VALID_DATA = TRAIN_AUGMENT.iloc[first_valid_eq_ids]
    # VALID_DATA = VALID_DATA[:(150000*16)]
    num_valid_files = int(VALID_DATA.shape[0]/valid_file_steps)
    # Keep only a whole number of files.
    VALID_DATA = VALID_DATA.iloc[np.arange(valid_file_steps*num_valid_files)]

    # The fold index passed here is num_folds itself.
    # NOTE(review): presumably this selects the model trained on all data -
    # confirm against get_fold_description.
    fold_description = get_fold_description(num_folds, num_folds)
    ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,})

    # Trailing files that do not fill a complete group of comp_rows_per_it
    # rows are left at zero.
    num_iterations = int(num_valid_files/comp_rows_per_it)
    order_probs = np.zeros((num_valid_files, num_valid_files))
    for i in range(num_iterations):
        gc.collect()
        print('\nIteration {} of {}'.format(i+1, num_iterations))
        first_test_id = int(comp_rows_per_it*i)
        test_gen = utils.generator_cpc_batch_test(
            VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)

        # Generate the test data by calling the generator *N* times.
        # The division is evaluated before the multiplication, so with
        # comp_rows_per_it == 4 this equals num_valid_files.
        N = int(num_valid_files/4*comp_rows_per_it)
        test_data = list(itertools.islice(test_gen, N))
        # The same loaded model is used both by the generator above and as
        # the prediction model here.
        test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                      data=test_data, model=encoder_model)
        # Use the fourth prediction output: average the first four entries of
        # its last axis, then average groups of four.
        # NOTE(review): the meaning of test_preds[3] and of its axes is not
        # visible here - confirm against make_predictions.
        order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
            [test_preds[3].shape[0], -1, 4]).mean(-1)
        order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
            order_preds.reshape([comp_rows_per_it, -1]))

    save_path = data_folder + 'valid_order_probs_raw_signal.npy'
    np.save(save_path, order_probs)
    # np.load(save_path)