Example #1
def test_order(model_path, split, data_folder, hyperpars):
  # Predict pairwise ordering probabilities for all test files
  comp_rows_per_it = 4  # comparison rows processed per iteration
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_test_files = 2624
  
  # fold == num_folds refers to the encoder trained on all training data
  fold_description = get_fold_description(num_folds, num_folds)
  ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,})
  num_iterations = int(num_test_files/comp_rows_per_it)
  # Square matrix of predicted pairwise ordering probabilities between files
  order_probs = np.zeros((num_test_files, num_test_files))
  
  for i in range(num_iterations):
    gc.collect()
    print('\nIteration {} of {}'.format(i+1, num_iterations))
    first_test_id = int(comp_rows_per_it*i)
    test_gen = utils.generator_cpc_batch_test(
        TEST_AUGMENT, hyperpars, encoder_model, first_test_id=first_test_id)
  
    # Generate the test data by calling the generator *N* times
    N = int(num_test_files/4*comp_rows_per_it)
    test_data = list(itertools.islice(test_gen, N))
    test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                  data=test_data, model=encoder_model)
    order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
        [test_preds[3].shape[0], -1, 4]).mean(-1)
    order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
        order_preds.reshape([comp_rows_per_it, -1]))
    
  save_path = data_folder + 'test_order_probs_raw_signal.npy'
  np.save(save_path, order_probs) # np.load(save_path)
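
The double mean over test_preds[3] collapses the raw network outputs to one ordering probability per candidate: it first averages the first four prediction columns, then averages groups of four consecutive rows. A minimal numpy sketch of the same reduction, with made-up shapes (the 8 batches, 16 rows, and 16 columns are assumptions, not values from this code):

import numpy as np

# Hypothetical prediction tensor: (batches, rows, prediction columns)
preds = np.random.rand(8, 16, 16)
col_mean = preds[:, :, :4].mean(-1)                  # (8, 16): mean of first 4 columns
order_preds = col_mean.reshape([8, -1, 4]).mean(-1)  # (8, 4): mean over row groups of 4
print(order_preds.shape)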
Example #2
def validate_models(save_path, splits, model_on_all_data_only, max_fold,
                    remove_incomplete_eqs):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  if model_on_all_data_only:
    num_validation_models = num_folds+1
  else:
    num_validation_models = min(max_fold, num_folds)
  valid_maes = []
  for split_id, split in enumerate(splits):
    print('Processing split {} of {}'.format(split_id+1, len(splits)))
    split_maes = []
    for fold in range(num_validation_models):
      # When only the all-data model was fit, validate just that final fold
      if not model_on_all_data_only or fold == num_folds:
        print('Processing fold {} of {}'.format(fold+1, num_validation_models))
        fold_mae = validate_model(
            save_path, split, fold, num_folds, train_val_split, hyperpars,
            hyperpars['validation_valid_batch'], remove_incomplete_eqs)
        split_maes.append(fold_mae)
    split_mae = np.array(split_maes).mean()
    split_maes = [split_mae] + split_maes
    print('Split OOF MAE: {0:.3f}'.format(split_mae))
    valid_maes.append((split_maes, splits[split_id]))
    
  return valid_maes
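
A hypothetical invocation of validate_models; the 'models/gaps' path and the argument values are placeholders, and the module-level hyperpars dict must already be defined:

# Hypothetical call: the path and argument values are assumptions.
maes = validate_models(save_path='models/gaps', splits=[0],
                       model_on_all_data_only=False, max_fold=5,
                       remove_incomplete_eqs=True)
for split_maes, split in maes:
    # split_maes[0] is the prepended OOF mean; the rest are per-fold MAEs
    print('Split {}: OOF MAE {:.3f}'.format(split, split_maes[0]))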
def valid_order(model_path, split, data_folder):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_train_models = 1
  
  # Set the validation data to the first eleven validation earthquakes,
  # starting from the first validation earthquake id
  first_val_ranges = train_val_split[split][0][1]  # First fold, validation
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  first_eq_id = other_train_features.eq_id.values[np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]]
  eq_ids = TRAIN_DATA.notrain_eq_id.values
  valid_rows = np.where(np.logical_and(eq_ids >= first_eq_id,
                                       eq_ids < (first_eq_id + 11)))[0]
  VALID_DATA = TRAIN_DATA.iloc[valid_rows]
  num_valid_files = int(VALID_DATA.shape[0]/(150000/hyperpars['block_steps']))
  
  order_probs = np.zeros((num_train_models, num_valid_files, num_valid_files))
  
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    K.clear_session()
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention})
    comp_rows_per_it = 4
    
    fold_description = get_fold_description(fold, num_folds)
    fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    model = load_model(fold_model_path, custom_objects={
              'Attention': models.Attention,
              'GradientReversal': models.GradientReversal,
              'tf': tf,
              })
    
    num_iterations = int(num_valid_files/comp_rows_per_it)
    for i in range(num_iterations):
      gc.collect()
      print('\nIteration {} of {}'.format(i+1, num_iterations))
      first_test_id = int(comp_rows_per_it*i)
      test_gen = utils.generator_cpc_main_batch_test(
          VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
    
      # Generate the test data by calling the generator *N* times
      N = int(num_valid_files/4*comp_rows_per_it)
      test_data = list(itertools.islice(test_gen, N))
      test_preds = make_predictions(model_path, split=-1, fold=-1,
                                    num_folds=-1, data=test_data, model=model)
      order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
          [test_preds[3].shape[0], -1, 4]).mean(-1)
      order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
          order_preds.reshape([comp_rows_per_it, -1]))
      
    save_path = data_folder + 'valid_order_probs.npy'
    np.save(save_path, order_probs) # np.load(save_path)
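
Downstream, the saved matrix can be turned into a file ordering. The greedy pass below is only an illustration, not the reconstruction logic used in this project; the path and the starting file are assumptions:

import numpy as np

probs = np.load('valid_order_probs.npy')[0]  # first (and only) fold slice
np.fill_diagonal(probs, -np.inf)             # a file cannot follow itself

order = [0]                                  # hypothetical starting file
probs[:, 0] = -np.inf                        # the start cannot be revisited
for _ in range(probs.shape[0] - 1):
    nxt = int(np.argmax(probs[order[-1]]))   # most probable successor
    probs[:, nxt] = -np.inf                  # never visit a file twice
    order.append(nxt)
print(order)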
Example #4
def validate_save_gap_preds(model_path, save_path, split, hyperpars):
    # Determine the first eleven validation earthquake ids
    num_first_eqs = 11
    train_val_split = preprocess.train_val_split_gaps()
    first_val_ranges = train_val_split[split][0][1]  # First fold, validation
    other_train_features = preprocess.get_preprocessed(
        'train', remove_overlap_chunks=True)[1]
    first_eq_id_other_features = np.where(
        other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
    first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
    first_row_next_eq = other_train_features.start_row.values[np.where(
        other_train_features.eq_id.values == first_eq_id +
        num_first_eqs)[0][0]]
    first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)

    # Drop the trailing part of valid_eq_ids that doesn't span an entire chunk
    valid_file_steps = 150000
    new_eq_ids = np.where(
        np.diff(other_train_features.eq_id.values) > 0)[0] + 1
    drop_eq_end_ids = new_eq_ids[
        new_eq_ids > first_eq_id_other_features][:num_first_eqs]
    drop_ids = np.array([])
    for i in range(num_first_eqs):
        drop_ids_eq = np.arange(
            other_train_features.start_row.values[drop_eq_end_ids[i] - 2] +
            valid_file_steps,
            other_train_features.start_row.values[drop_eq_end_ids[i]])
        drop_ids = np.append(drop_ids, drop_ids_eq)

    first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids,
                                      drop_ids,
                                      assume_unique=True)

    # Same logic as in test to generate the gap predicted probabilities
    x_valid = GAP_DATA.iloc[first_valid_eq_ids]
    #  x_valid = x_valid[:600000]
    num_valid_files = int(x_valid.shape[0] / valid_file_steps)
    x_valid = x_valid.iloc[np.arange(valid_file_steps * num_valid_files)]
    valid_ranges = (valid_file_steps * np.arange(num_valid_files),
                    valid_file_steps * (1 + np.arange(num_valid_files)) -
                    (hyperpars['block_steps']))
    (x_valid_batched, _, valid_start_rows) = utils.get_gap_prediction_features(
        x_valid, valid_ranges, hyperpars, order_start_rows=True)
    file_names = ['valid_' + str(i + 1) for i in range(num_valid_files)]
    valid_preds = make_predictions(model_path,
                                   split,
                                   fold=0,
                                   num_folds=1,
                                   x_features=x_valid_batched)
    valid_gap_preds_aligned = utils.align_test_gap_preds(
        valid_preds, valid_file_steps, valid_start_rows, hyperpars, file_names)
    data_path = save_path + '_aligned_predictions_valid' + '.csv'
    valid_gap_preds_aligned.to_csv(data_path, index=False)
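
valid_ranges pairs each chunk's first row with its last admissible block start. A quick numeric check with assumed values (block_steps = 1000 is an assumption; only valid_file_steps = 150000 comes from the code above):

import numpy as np

valid_file_steps, block_steps, num_valid_files = 150000, 1000, 3
starts = valid_file_steps * np.arange(num_valid_files)
ends = valid_file_steps * (1 + np.arange(num_valid_files)) - block_steps
print([(int(a), int(b)) for a, b in zip(starts, ends)])
# [(0, 149000), (150000, 299000), (300000, 449000)]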
Example #5
def train_models(custom_model, save_path, splits, hyperpars, overwrite_train,
                 model_on_all_data, max_fold):
    train_val_split = preprocess.train_val_split_gaps()
    num_folds = len(train_val_split[0])
    num_train_models = min(max_fold, num_folds + int(model_on_all_data))
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        for fold in range(num_train_models):
            print('\nProcessing fold {} of {}'.format(fold + 1,
                                                      num_train_models))
            K.clear_session()
            fit_model(custom_model, save_path, split, fold, num_folds,
                      train_val_split, hyperpars, overwrite_train)
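
Since train_models just dispatches to fit_model once per split and fold, a hypothetical call looks as follows; my_model_fn, the path, and the argument values are placeholders for whatever the project actually passes in:

# Hypothetical invocation; my_model_fn is a stand-in for a real model builder.
def my_model_fn(*args, **kwargs):
    raise NotImplementedError

train_models(custom_model=my_model_fn, save_path='models/gaps', splits=[0],
             hyperpars=hyperpars, overwrite_train=False,
             model_on_all_data=True, max_fold=100)

With model_on_all_data=True and a large max_fold, one extra model is fit on all training data after the per-fold models.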
def test_order(model_path, split, model_on_all_data, data_folder):
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])
  num_train_models = num_folds + int(model_on_all_data)
  num_test_files = 2624
  order_probs = np.zeros((num_train_models, num_test_files, num_test_files))
  
  for fold in range(num_train_models):
    print('\nProcessing fold {} of {}'.format(fold+1, num_train_models))
    K.clear_session()
    encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention})
    comp_rows_per_it = 16
    
    fold_description = get_fold_description(fold, num_folds)
    fold_model_path = '{}-{}-{}.h5'.format(model_path, split, fold_description)
    model = load_model(fold_model_path, custom_objects={
              'Attention': models.Attention,
              'GradientReversal': models.GradientReversal,
              'tf': tf,
              })
    
    num_iterations = int(num_test_files/comp_rows_per_it)
    for i in range(num_iterations):
      gc.collect()
      print('\nIteration {} of {}'.format(i+1, num_iterations))
      first_test_id = int(comp_rows_per_it*i)
      test_gen = utils.generator_cpc_main_batch_test(
          TEST_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
    
      # Generate the test data by calling the generator *N* times
      N = int(num_test_files/4*comp_rows_per_it)
      test_data = list(itertools.islice(test_gen, N))
      test_preds = make_predictions(model_path, split=-1, fold=-1,
                                    num_folds=-1, data=test_data, model=model)
      order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
          [test_preds[3].shape[0], -1, 4]).mean(-1)
      order_probs[fold, first_test_id:(first_test_id+comp_rows_per_it)] = (
          order_preds.reshape([comp_rows_per_it, -1]))
      
    save_path = data_folder + 'test_order_probs.npy'
    np.save(save_path, order_probs) # np.load(save_path)
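
order_probs here carries one slice per fold, so a simple fold ensemble is a mean along the first axis; a minimal sketch, assuming the file saved by the function above (the bare filename stands in for the full data_folder path):

import numpy as np

probs = np.load('test_order_probs.npy')  # shape (num_train_models, 2624, 2624)
ensemble = probs.mean(0)                 # average the per-fold probabilities
print(ensemble.shape)                    # (2624, 2624)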
Example #7
def validate_models(save_path, splits, max_fold):
    train_val_split = preprocess.train_val_split_gaps()
    num_folds = min(max_fold, len(train_val_split[0]))
    valid_ratios = []
    for split_id, split in enumerate(splits):
        print('Processing split {} of {}'.format(split_id + 1, len(splits)))
        sum_split_ratios = []
        split_ratios = []
        oof_counts = []
        for fold in range(num_folds):
            print('Processing fold {} of {}'.format(fold + 1, num_folds))
            fold_ratio, oof_count = get_fold_ratio(save_path, split, fold,
                                                   num_folds, train_val_split,
                                                   hyperpars)
            sum_split_ratios.append(fold_ratio * oof_count)
            split_ratios.append((fold_ratio, oof_count))
            oof_counts.append(oof_count)
        split_ratio = np.array(sum_split_ratios).sum() / np.array(
            oof_counts).sum()
        split_ratios = [split_ratio] + split_ratios
        print('Split OOF ratio: {}'.format(np.round(split_ratio, 3)))
        valid_ratios.append((split_ratios, splits[split_id]))
    return valid_ratios
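
The split ratio is a count-weighted mean of the per-fold ratios, so larger validation folds dominate. A two-fold numeric check with made-up numbers:

import numpy as np

fold_ratios = np.array([0.8, 0.6])  # hypothetical per-fold ratios
oof_counts = np.array([100, 300])   # hypothetical out-of-fold counts
split_ratio = (fold_ratios * oof_counts).sum() / oof_counts.sum()
print(round(split_ratio, 3))        # 0.65, pulled toward the larger fold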
Example #8
def valid_order(model_path, split, data_folder, hyperpars):
  # Determine the first eleven validation earthquake ids
  num_first_eqs = 11
  valid_file_steps = 150000
  comp_rows_per_it = 4
  train_val_split = preprocess.train_val_split_gaps()
  num_folds = len(train_val_split[0])

  first_val_ranges = train_val_split[split][0][1]  # First fold, validation
  other_train_features = preprocess.get_preprocessed(
      'train', remove_overlap_chunks=True)[1]
  first_eq_id_other_features = np.where(
      other_train_features.start_row.values == first_val_ranges[0][0])[0][0]
  first_eq_id = other_train_features.eq_id.values[first_eq_id_other_features]
  first_row_next_eq = other_train_features.start_row.values[np.where(
      other_train_features.eq_id.values == first_eq_id+num_first_eqs)[0][0]]
  first_valid_eq_ids = np.arange(first_val_ranges[0][0], first_row_next_eq)
  
  # Drop the trailing part of valid_eq_ids that doesn't span an entire chunk
  new_eq_ids = np.where(np.diff(other_train_features.eq_id.values) > 0)[0] + 1
  drop_eq_end_ids = new_eq_ids[new_eq_ids > first_eq_id_other_features][
      :num_first_eqs]
  drop_ids = np.array([])
  for i in range(num_first_eqs):
    drop_ids_eq = np.arange(
        other_train_features.start_row.values[
            drop_eq_end_ids[i]-2]+valid_file_steps,
        other_train_features.start_row.values[drop_eq_end_ids[i]])
    drop_ids = np.append(drop_ids, drop_ids_eq)
    
  first_valid_eq_ids = np.setdiff1d(first_valid_eq_ids, drop_ids,
                                    assume_unique=True)
  
  # Same logic as in test to generate the gap predicted probabilities
  VALID_DATA = TRAIN_AUGMENT.iloc[first_valid_eq_ids]
#  VALID_DATA = VALID_DATA[:(150000*16)]
  num_valid_files = int(VALID_DATA.shape[0]/valid_file_steps)
  VALID_DATA = VALID_DATA.iloc[np.arange(valid_file_steps*num_valid_files)]
  
  fold_description = get_fold_description(num_folds, num_folds)
  ENCODER_PATH = '{}-{}-{}.h5'.format(model_path, split, fold_description)
  encoder_model = load_model(ENCODER_PATH, custom_objects={
        'Attention': models.Attention,
        'GradientReversal': models.GradientReversal,
        'tf': tf,})
  num_iterations = int(num_valid_files/comp_rows_per_it)
  order_probs = np.zeros((num_valid_files, num_valid_files))
  
  for i in range(num_iterations):
    gc.collect()
    print('\nIteration {} of {}'.format(i+1, num_iterations))
    first_test_id = int(comp_rows_per_it*i)
    test_gen = utils.generator_cpc_batch_test(
        VALID_DATA, hyperpars, encoder_model, first_test_id=first_test_id)
  
    # Generate the test data by calling the generator *N* times
    N = int(num_valid_files/4*comp_rows_per_it)
    test_data = list(itertools.islice(test_gen, N))
    test_preds = make_predictions(model_path, split=-1, fold=-1, num_folds=-1,
                                  data=test_data, model=encoder_model)
    order_preds = test_preds[3][:, :, :4].mean(-1).reshape(
        [test_preds[3].shape[0], -1, 4]).mean(-1)
    order_probs[first_test_id:(first_test_id+comp_rows_per_it)] = (
        order_preds.reshape([comp_rows_per_it, -1]))
    
  save_path = data_folder + 'valid_order_probs_raw_signal.npy'
  np.save(save_path, order_probs) # np.load(save_path)
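
The trimming above removes, for each earthquake, the tail rows that cannot fill a complete 150,000-step chunk. A toy np.setdiff1d sketch with made-up row ids:

import numpy as np

candidate_rows = np.arange(20)        # hypothetical candidate row ids
incomplete_tail = np.arange(17, 20)   # rows past the last full chunk
kept = np.setdiff1d(candidate_rows, incomplete_tail, assume_unique=True)
print(kept)                           # rows 0..16 survive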