def test_order(model_path, split, data_folder, hyperpars):
  comp_rows_per_it = 4
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_test_files = 2624
  fold_description = get_fold_description(num_folds, num_folds)
  ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  encoder_model = load_model(ENCODER_PATH, custom_objects={
      'Attention': models.Attention,
      'GradientReversal': models.GradientReversal,
      'tf': tf,
  })
  num_iterations = int(num_test_files/comp_rows_per_it)
  order_probs = np.zeros((num_test_files, num_test_files))
  for i in range(num_iterations):
    gc.collect()
    print('\nIteration {} of {}'.format(i+1, num_iterations))
    first_test_id = int(comp_rows_per_it*i)
    test_gen = utils.generator_cpc_batch_test(
        TEST_AUGMENT, hyperpars, encoder_model, first_test_id=first_test_id)
    # Generate the test data by calling the generator N times
    N = int(num_test_files/4*comp_rows_per_it)
    test_data = list(itertools.islice(test_gen, N))
    test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                  data=test_data, model=encoder_model)
    order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
        [test_preds[3].shape[0], -1, 4]).mean(-1)
    order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
        order_preds.reshape([comp_rows_per_it, -1]))
  save_path = data_folder + 'test_order_probs_raw_signal.npy'
  np.save(save_path, order_probs)  # np.load(save_path)

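def _demo_order_preds_reduction():
  # Hypothetical sanity check, not called by the pipeline: illustrates the
  # double-mean reduction used in test_order above. It assumes test_preds[3]
  # has shape (num_rows, num_comparisons*4, num_heads) with at least four
  # heads: the first mean averages the heads, the second averages groups of
  # four consecutive comparison columns into one probability per compared
  # file. Requires only the module-level numpy import.
  dummy_preds = np.random.uniform(size=(4, 32, 4))  # stand-in for test_preds[3]
  reduced = dummy_preds[:, :, :4].mean(-1).reshape(
      [dummy_preds.shape[0], -1, 4]).mean(-1)
  assert reduced.shape == (4, 8)
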
def validate_models(save_path, splits, model_on_all_data_only, max_fold,
                    remove_incomplete_eqs):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  if model_on_all_data_only:
    num_validation_models = num_folds+1
  else:
    num_validation_models = min(max_fold, num_folds)
  valid_maes = []
  for split_id, split in enumerate(splits):
    print('Processing split {} of {}'.format(split_id+1, len(splits)))
    split_maes = []
    for fold in range(num_validation_models):
      if not model_on_all_data_only or fold == num_folds:
        print('Processing fold {} of {}'.format(fold+1, num_validation_models))
        fold_mae = validate_model(save_path, split, fold, num_folds,
                                  train_val_split, hyperpars,
                                  hyperpars['validation_valid_batch'],
                                  remove_incomplete_eqs)
        split_maes.append(fold_mae)
    split_mae = np.array(split_maes).mean()
    split_maes = [split_mae] + split_maes
    print('Split OOF MAE: {0:.3f}'.format(split_mae))
    valid_maes.append((split_maes, splits[split_id]))
  return valid_maes

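def _demo_validate_models():
  # Minimal usage sketch with hypothetical arguments ('models/gap_model' is a
  # placeholder save path): validate every fold of split 0 and print the OOF
  # summary. Each returned element pairs [split_mae, fold_mae_1, ...] with
  # its split id.
  valid_maes = validate_models('models/gap_model', splits=[0],
                               model_on_all_data_only=False, max_fold=5,
                               remove_incomplete_eqs=True)
  split_maes, split = valid_maes[0]
  print('Split {} OOF MAE: {:.3f}'.format(split, split_maes[0]))
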
def valid_order(model_path, split, data_folder):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_train_models = 1
  # Set the validation data to the first eleven validation earthquake ids,
  # starting from the first validation earthquake
  first_val_ranges = train_val_split[split][0][1]  # First fold, validation
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  first_eq_id = other_train_features.eq_id.values[np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]]
  eq_ids = TRAIN_DATA.notrain_eq_id.values
  valid_rows = np.where(np.logical_and(
      eq_ids >= first_eq_id, eq_ids < (first_eq_id + 11)))[0]
  VALID_DATA = TRAIN_DATA.iloc[valid_rows]
  num_valid_files = int(VALID_DATA.shape[0]/(150000/hyperpars['block_steps']))
  order_probs = np.zeros((num_train_models, num_valid_files, num_valid_files))
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    K.clear_session()
    # ENCODER_PATH is expected to be defined at module level
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention})
    comp_rows_per_it = 4
    fold_description = get_fold_description(fold, num_folds)
    fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    model = load_model(fold_model_path, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,
    })
    num_iterations = int(num_valid_files/comp_rows_per_it)
    for i in range(num_iterations):
      gc.collect()
      print('\nIteration {} of {}'.format(i+1, num_iterations))
      first_test_id = int(comp_rows_per_it*i)
      test_gen = utils.generator_cpc_main_batch_test(
          VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
      # Generate the test data by calling the generator N times
      N = int(num_valid_files/4*comp_rows_per_it)
      test_data = list(itertools.islice(test_gen, N))
      test_preds = make_predictions(model_path, split=-1, fold=-1,
                                    num_folds=-1, data=test_data, model=model)
      order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
          [test_preds[3].shape[0], -1, 4]).mean(-1)
      order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
          order_preds.reshape([comp_rows_per_it, -1]))
  save_path = data_folder + 'valid_order_probs.npy'
  np.save(save_path, order_probs)  # np.load(save_path)

def validate_save_gap_preds(model_path, save_path, split, hyperpars):
  # Determine the first eleven validation earthquake ids
  num_first_eqs = 11
  train_val_split = preprocess.train_val_split_gaps()
  first_val_ranges = train_val_split[split][0][1]  # First fold, validation
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  first_eq_id_other_features = np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
  first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
  first_row_next_eq = other_train_features.start_row.values[np.where(
      other_train_features.eq_id.values == first_eq_id + num_first_eqs)[0][0]]
  first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)

  # Drop the last part of the valid_eq_ids that doesn't contain an entire chunk
  valid_file_steps = 150000
  new_eq_ids = np.where(np.diff(other_train_features.eq_id.values) > 0)[0] + 1
  drop_eq_end_ids = new_eq_ids[
      new_eq_ids > first_eq_id_other_features][:num_first_eqs]
  drop_ids = np.array([])
  for i in range(num_first_eqs):
    drop_ids_eq = np.arange(
        other_train_features.start_row.values[
            drop_eq_end_ids[i] - 2] + valid_file_steps,
        other_train_features.start_row.values[drop_eq_end_ids[i]])
    drop_ids = np.append(drop_ids, drop_ids_eq)
  first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids,
                                    assume_unique=True)

  # Same logic as in test to generate the gap predicted probabilities
  x_valid = GAP_DATA.iloc[first_valid_eq_ids]
  # x_valid = x_valid[:600000]
  num_valid_files = int(x_valid.shape[0] / valid_file_steps)
  x_valid = x_valid.iloc[np.arange(valid_file_steps * num_valid_files)]
  valid_ranges = (valid_file_steps * np.arange(num_valid_files),
                  valid_file_steps * (1 + np.arange(num_valid_files)) - (
                      hyperpars['block_steps']))
  (x_valid_batched, _, valid_start_rows) = utils.get_gap_prediction_features(
      x_valid, valid_ranges, hyperpars, order_start_rows=True)
  file_names = ['valid_' + str(i + 1) for i in range(num_valid_files)]
  valid_preds = make_predictions(model_path, split, fold=0, num_folds=1,
                                 x_features=x_valid_batched)
  valid_gap_preds_aligned = utils.align_test_gap_preds(
      valid_preds, valid_file_steps, valid_start_rows, hyperpars, file_names)
  data_path = save_path + '_aligned_predictions_valid.csv'
  valid_gap_preds_aligned.to_csv(data_path, index=False)

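def _demo_chunk_trimming():
  # Toy illustration (hypothetical numbers, not used by the pipeline) of the
  # trimming step above: rows past the last complete 150000-step chunk of
  # each earthquake are collected in drop_ids and removed via np.setdiff1d.
  # Here a fake earthquake spans rows 0..9 with a chunk size of 4, so rows 8
  # and 9 do not fill a chunk and are dropped.
  candidate_rows = np.arange(10)
  drop_rows = np.arange(8, 10)
  kept = np.setdiff1d(candidate_rows, drop_rows, assume_unique=True)
  assert (kept == np.arange(8)).all()
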
def train_models(custom_model, save_path, splits, hyperpars, overwrite_train,
                 model_on_all_data, max_fold):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_train_models = min(max_fold, num_folds + int(model_on_all_data))
  for split_id, split in enumerate(splits):
    print('Processing split {} of {}'.format(split_id + 1, len(splits)))
    for fold in range(num_train_models):
      print('\nProcessing fold {} of {}'.format(fold + 1, num_train_models))
      K.clear_session()
      fit_model(custom_model, save_path, split, fold, num_folds,
                train_val_split, hyperpars, overwrite_train)

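def _demo_train_models(custom_model, hyperpars):
  # Hypothetical driver (custom_model and hyperpars supplied by the caller;
  # 'models/cpc_main' is a placeholder path): trains every fold of split 0
  # plus one model on all training data, capped at max_fold models.
  train_models(custom_model, 'models/cpc_main', splits=[0],
               hyperpars=hyperpars, overwrite_train=False,
               model_on_all_data=True, max_fold=100)
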
def test_order(model_path, split, model_on_all_data, data_folder):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_train_models = num_folds + int(model_on_all_data)
  num_test_files = 2624
  order_probs = np.zeros((num_train_models, num_test_files, num_test_files))
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    K.clear_session()
    # ENCODER_PATH is expected to be defined at module level
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention})
    comp_rows_per_it = 16
    fold_description = get_fold_description(fold, num_folds)
    fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    model = load_model(fold_model_path, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,
    })
    num_iterations = int(num_test_files/comp_rows_per_it)
    for i in range(num_iterations):
      gc.collect()
      print('\nIteration {} of {}'.format(i+1, num_iterations))
      first_test_id = int(comp_rows_per_it*i)
      test_gen = utils.generator_cpc_main_batch_test(
          TEST_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
      # Generate the test data by calling the generator N times
      N = int(num_test_files/4*comp_rows_per_it)
      test_data = list(itertools.islice(test_gen, N))
      test_preds = make_predictions(model_path, split=-1, fold=-1,
                                    num_folds=-1, data=test_data, model=model)
      order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
          [test_preds[3].shape[0], -1, 4]).mean(-1)
      order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
          order_preds.reshape([comp_rows_per_it, -1]))
  save_path = data_folder + 'test_order_probs.npy'
  np.save(save_path, order_probs)  # np.load(save_path)

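def _demo_load_fold_model(model_path, split, fold, num_folds):
  # Minimal sketch of the checkpoint-loading convention used above: fold
  # models are saved as '<model_path>-<split>-<fold_description>.h5', and any
  # custom layers referenced by the saved graph must be passed to Keras'
  # load_model via custom_objects (here the repo's Attention and
  # GradientReversal layers, plus the tf module used inside Lambda layers).
  fold_description = get_fold_description(fold, num_folds)
  fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  return load_model(fold_model_path, custom_objects={
      'Attention': models.Attention,
      'GradientReversal': models.GradientReversal,
      'tf': tf,
  })
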
def validate_models(save_path, splits, max_fold):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = min(max_fold, len(train_val_split[0]))
  valid_ratios = []
  for split_id, split in enumerate(splits):
    print('Processing split {} of {}'.format(split_id + 1, len(splits)))
    sum_split_ratios = []
    split_ratios = []
    oof_counts = []
    for fold in range(num_folds):
      print('Processing fold {} of {}'.format(fold + 1, num_folds))
      fold_ratio, oof_count = get_fold_ratio(
          save_path, split, fold, num_folds, train_val_split, hyperpars)
      sum_split_ratios.append(fold_ratio * oof_count)
      split_ratios.append((fold_ratio, oof_count))
      oof_counts.append(oof_count)
    split_ratio = np.array(sum_split_ratios).sum() / np.array(
        oof_counts).sum()
    split_ratios = [split_ratio] + split_ratios
    print('Split OOF ratio: {}'.format(np.round(split_ratio, 3)))
    valid_ratios.append((split_ratios, splits[split_id]))
  return valid_ratios

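def _demo_weighted_oof_ratio():
  # Toy check (hypothetical numbers) of the aggregation above: the split
  # ratio is the average of the fold ratios weighted by their OOF counts,
  # i.e. sum(ratio_i * count_i) / sum(count_i).
  fold_ratios = np.array([0.8, 0.6])
  oof_counts = np.array([100, 300])
  split_ratio = (fold_ratios * oof_counts).sum() / oof_counts.sum()
  assert np.isclose(split_ratio, 0.65)
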
def valid_order(model_path, split, data_folder, hyperpars):
  # Determine the first eleven validation earthquake ids
  num_first_eqs = 11
  valid_file_steps = 150000
  comp_rows_per_it = 4
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  first_val_ranges = train_val_split[split][0][1]  # First fold, validation
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  first_eq_id_other_features = np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
  first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
  first_row_next_eq = other_train_features.start_row.values[np.where(
      other_train_features.eq_id.values == first_eq_id+num_first_eqs)[0][0]]
  first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)

  # Drop the last part of the valid_eq_ids that doesn't contain an entire chunk
  new_eq_ids = np.where(np.diff(other_train_features.eq_id.values) > 0)[0] + 1
  drop_eq_end_ids = new_eq_ids[new_eq_ids > first_eq_id_other_features][
      :num_first_eqs]
  drop_ids = np.array([])
  for i in range(num_first_eqs):
    drop_ids_eq = np.arange(
        other_train_features.start_row.values[
            drop_eq_end_ids[i]-2]+valid_file_steps,
        other_train_features.start_row.values[drop_eq_end_ids[i]])
    drop_ids = np.append(drop_ids, drop_ids_eq)
  first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids,
                                    assume_unique=True)

  # Same logic as in test to generate the gap predicted probabilities
  VALID_DATA = TRAIN_AUGMENT.iloc[first_valid_eq_ids]
  # VALID_DATA = VALID_DATA[:(150000*16)]
  num_valid_files = int(VALID_DATA.shape[0]/valid_file_steps)
  VALID_DATA = VALID_DATA.iloc[np.arange(valid_file_steps*num_valid_files)]
  fold_description = get_fold_description(num_folds, num_folds)
  ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  encoder_model = load_model(ENCODER_PATH, custom_objects={
      'Attention': models.Attention,
      'GradientReversal': models.GradientReversal,
      'tf': tf,
  })
  num_iterations = int(num_valid_files/comp_rows_per_it)
  order_probs = np.zeros((num_valid_files, num_valid_files))
  for i in range(num_iterations):
    gc.collect()
    print('\nIteration {} of {}'.format(i+1, num_iterations))
    first_test_id = int(comp_rows_per_it*i)
    test_gen = utils.generator_cpc_batch_test(
        VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
    # Generate the test data by calling the generator N times
    N = int(num_valid_files/4*comp_rows_per_it)
    test_data = list(itertools.islice(test_gen, N))
    test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                  data=test_data, model=encoder_model)
    order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
        [test_preds[3].shape[0], -1, 4]).mean(-1)
    order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
        order_preds.reshape([comp_rows_per_it, -1]))
  save_path = data_folder + 'valid_order_probs_raw_signal.npy'
  np.save(save_path, order_probs)  # np.load(save_path)