def run(try_num, config):
    """Train and infer the DeepInsight + EfficientNet MoA model.

    Trains one EfficientNet per CV fold on DeepInsight image-transformed
    tabular features, then produces OOF predictions and a test submission.

    Args:
        try_num: Experiment number; used to name the output directory.
        config: Experiment configuration object (exposes to_dict()/update()
            and attribute access to hyperparameters).

    Side effects: reads the lish-moa CSVs (and optionally DAE features),
    writes scalers/transformers/model weights, `oof_preds.csv`, `log.csv`
    and `submission.csv` under `deepinsight-{try_num}/`.
    """
    args = get_args()
    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)
    set_seed(config.rand_seed)

    pretrained_model = f"tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv(f"../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv(f"../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv(f"../input/lish-moa/test_features.csv")
    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        # Tiny subset + short schedule for a quick smoke run.
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            # Keep the DAE rows aligned: first 500 train rows plus the
            # trailing test rows (test set has 3982 rows).
            dae_features = pd.concat([dae_features.iloc[:500],
                                      dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(dict(kfolds=3, n_epoch=3))

    # Sort both frames by sig_id so features and targets stay row-aligned.
    train_features = train_features.sort_values(by=["sig_id"], axis=0,
                                                inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(by=["sig_id"], axis=0,
                                              inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            # Swap original numeric features for the DAE reconstructions.
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            # Append DAE features as additional columns.
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features,
                len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    # Drop control-vehicle rows: they have no MoA by construction.
    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features,
                                                  num_feature_columns)

    # Encode categoricals to numeric codes (in place on both frames).
    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'
        # BUGFIX: build a fresh list instead of aliasing num_feature_columns.
        # The original `variance_target_features = num_feature_columns`
        # followed by `+=` mutated num_feature_columns in place, so DAE
        # columns leaked into the "numeric feature" list used by the
        # log-normalizer branch below.
        variance_target_features = list(num_feature_columns)
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns
        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features,
                                        config.variance_threshold)
            save_pickle(vt, pickle_path)
        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features,
                                                      variance_target_features)
        test_features = variance_reduction_transform(vt, test_features,
                                                     variance_target_features)
        print('(variance_reduction) Number of features after applying:',
              len(train_features.columns), flush=True)
        # Recompute the feature list: variance reduction dropped columns.
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True,
                                    random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(
            skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break
        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                # Scale only numeric columns; categorical columns pass
                # through and are moved to the front of the matrix.
                target_features = [i for i, c in enumerate(all_features_columns)
                                   if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns)
                                       if c not in num_feature_columns]
                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        # Fit the tabular-to-image transformer on this fold's training data.
        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)
        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                # Down-weight (per target) the minority labels; indices is a
                # 0/1 mask over target columns (1 = non-minority).
                indices = get_minority_target_index(
                    train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss
        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay,
                               lr=config.learning_rate)
        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience,
                eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):
            if config.swap_enable:
                # Augmentation: randomly swap a portion of feature values.
                dataset = MoAImageSwapDataset(
                    X_train, y_train, transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer,
                                          image_size=config.image_size)
            dataloader = DataLoader(
                dataset, batch_size=config.batch_size, shuffle=True,
                num_workers=8, pin_memory=True, drop_last=False)
            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            # ReduceLROnPlateau needs the metric; others step unconditionally.
            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
            for param_group in optimizer.param_groups:
                print('current learning rate:', param_group['lr'])
            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer,
                                      image_size=config.image_size)
            dataloader = DataLoader(
                dataset, batch_size=config.infer_batch_size, shuffle=False,
                num_workers=8, pin_memory=True, drop_last=False)
            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1,
                           'train_loss': loss, 'val_loss': valid_loss})
            print(f'epoch {epoch + 1}/{config.n_epoch} - train_loss: {loss:.5f} - ' +
                  f'valid_loss: {valid_loss:.5f} - elapsed: {time_format(time.time() - start_time)}',
                  flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')
            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds} - best_valid_loss: {best_score:.5f} - ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)
        torch.cuda.empty_cache()
        gc.collect()
        if args.return_first_fold:
            # Debug shortcut: train only the first fold and stop.
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))
    start_time = time.time()
    print('Start infarence', flush=True)
    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(
            skf.split(train_features, train_targets[y_labels])):
        print(f'Infarence Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            # NOTE(review): the scaler is applied to the full matrix here,
            # but training with norm_apply_all=False scaled only numeric
            # columns and reordered them — confirm norm_apply_all is True
            # whenever normalizer == 'log', otherwise train/infer disagree.
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')
        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer,
                                  image_size=config.image_size)
        dataloader = DataLoader(
            dataset, batch_size=config.infer_batch_size, shuffle=False,
            num_workers=8, pin_memory=True, drop_last=False)
        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds} - fold_valid_loss: {valid_loss:.5f}', flush=True)
        logger.update({'fold': fold_index, 'val_loss': valid_loss})
        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset, batch_size=config.infer_batch_size, shuffle=False,
            num_workers=8, pin_memory=True, drop_last=False)
        preds = loop_preds(model, dataloader)
        # Average test predictions across folds.
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)
    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)
    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done infarence Elapsed {time_format(time.time() - start_time)}', flush=True)
    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    # Control vehicles have no MoA; cp_type was mapped to 0 for ctl_vehicle.
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
def run(try_num, config):
    """Train a denoising autoencoder (DAE) over train+test features.

    Trains one autoencoder per CV fold on the combined g-/c- feature
    matrix, evaluates each fold's model on the full dataset, and writes
    the fold-averaged reconstructions as engineered features.

    Args:
        try_num: Experiment number; used to name the output directory.
        config: Experiment configuration (n_folds, n_epochs, model_kind,
            noise_ratio, learning_rate, batch sizes).

    Side effects: writes per-fold weights, `dae_log.csv` and
    `dae_features_mean.csv` under `dae-out-{try_num}/`.
    """
    output_dir = f'./dae-out-{try_num}'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    args = get_args()
    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    if args.debug:
        train_features = train_features.loc[:500]
        config.update(dict(n_epochs=3, n_folds=2))

    # The DAE is unsupervised, so train and test rows can be pooled.
    all_features = pd.concat([train_features, test_features]).reset_index(drop=True)
    g_features_columns = [col for col in all_features.columns if col.startswith('g-')]
    c_features_columns = [col for col in all_features.columns if col.startswith('c-')]
    feature_columns = g_features_columns + c_features_columns
    n_features = len(feature_columns)

    # NOTE(review): the feature matrix itself is passed as the "labels"
    # argument for stratification — confirm this is the intended use of
    # MultilabelStratifiedKFold here.
    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42,
                                      shuffle=True)
    logger = Logger()

    for fold_index, (train_idx, valid_idx) in enumerate(
            kfold.split(all_features.values, all_features.values)):
        print('Fold: ', fold_index + 1, flush=True)
        x_train = all_features.loc[train_idx]
        x_valid = all_features.loc[valid_idx]

        model = new_autoencoder(config.model_kind, n_features=n_features).to(DEVICE)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate,
                                     weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        early_stopping = EarlyStopping(patience=10)
        best_score = np.inf

        for epoch in range(config.n_epochs):
            # Fresh dataset each epoch so input noise is re-sampled.
            dataset = DaeDataset(x_train, feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset, batch_size=config.batch_size,
                                    shuffle=True)
            train_loss = loop_train(model, criterion, dataloader, optimizer)

            dataset = DaeDataset(x_valid, feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset, batch_size=config.valid_batch_size,
                                    shuffle=False)
            valid_loss, _ = loop_valid(model, criterion, dataloader)

            scheduler.step(valid_loss)
            logger.update({
                'fold': fold_index,
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': valid_loss
            })
            print(
                f'epoch {epoch + 1}/{config.n_epochs} - train_loss: {train_loss:.5f} - ' +
                f'valid_loss: {valid_loss:.5f}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(),
                           f'./{output_dir}/dae_fold_weight_{fold_index}.pt')
            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

    # Intermediate save so training history survives an evaluation crash.
    logger.save(f'./{output_dir}/dae_log.csv')

    oof_preds = []
    for fold_index in range(config.n_folds):
        model = new_autoencoder(config.model_kind, n_features=n_features).to(DEVICE)
        model.load_state_dict(
            torch.load(f'./{output_dir}/dae_fold_weight_{fold_index}.pt'))
        model.eval()

        dataset = DaeDataset(all_features, feature_columns,
                             noise_ratio=config.noise_ratio)
        dataloader = DataLoader(dataset, batch_size=config.valid_batch_size,
                                shuffle=False)
        loss, preds = loop_valid(model, nn.MSELoss(), dataloader)
        logger.update({'fold': fold_index, 'val_loss': loss})
        print('Evaluation fold: {} - valid_loss: {:.5f}'.format(
            fold_index, loss), flush=True)
        oof_preds.append(preds)

    print('A Whole Evaluation Score: {:.5f}'.format(
        mean_squared_error(all_features.loc[:, feature_columns].values,
                           np.mean(oof_preds, axis=0))), flush=True)

    # BUGFIX: persist the evaluation-phase logger.update entries too.
    # The original saved the log only before the evaluation loop, so the
    # per-fold evaluation losses were recorded but never written to disk.
    logger.save(f'./{output_dir}/dae_log.csv')

    # Export the fold-averaged reconstructions as DAE features.
    create_pred_feature_df(np.mean(oof_preds, axis=0), all_features).to_csv(
        f'./{output_dir}/dae_features_mean.csv', index=False)
def run(try_num, config):
    """Train and infer the blending feed-forward NN over MoA targets.

    For each seed in config.seeds and each of config.n_folds CV folds,
    trains a model on DAE-augmented features, then averages out-of-fold
    and test predictions across seeds and folds.

    Side effects: reads the lish-moa CSVs and config.dae_path, writes
    per-seed/fold checkpoints, `oof_pred.csv`, `log.csv` and
    `submission.csv` under `blending-01-nn-{try_num}/`.
    """
    logger = Logger()
    args = get_args()
    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)
    # Work around duplicate OpenMP runtime crashes (common with MKL+PyTorch).
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-01-nn-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        # Tiny subset + short schedule for a quick smoke run. The DAE frame
        # keeps the first 500 train rows plus the trailing test rows
        # (test set has 3982 rows) to stay row-aligned.
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat(
            [dae_features.iloc[:500],
             dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(
            dict(
                n_folds=3,
                seeds=[222],
                n_epochs=3,
                batch_size=128,
            ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    # Project-level preprocessing (merges DAE features, encodes, etc.).
    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features,
        dae_features)

    # Everything except identifiers and (one-hot or raw) treatment metadata.
    features_columns = [
        col for col in train_features.columns if col not in [
            'sig_id', 'cp_type', 'cp_time', 'cp_dose', 'cp_type_ctl_vehicle',
            'cp_type_trt_cp'
        ]
    ]

    # BCELoss is used as the reported metric; the smoothed loss drives training.
    metric_loss_function = nn.BCELoss()
    if config.weighted_loss_strategy == 1:
        # indices becomes a 0/1 mask over target columns (1 = non-minority).
        indices = get_minority_target_index(
            train_targets, threshold=config.weighted_loss_threshold)
        indices = [int(i not in indices) for i, c in enumerate(target_columns)]
        smooth_loss_function = SmoothBCELoss(
            smoothing=config.smoothing,
            weight=config.weighted_loss_weights,
            weight_targets=indices,
            n_labels=n_targets)
    else:
        smooth_loss_function = SmoothBCELoss(smoothing=config.smoothing)

    # Fixed random_state so the training and inference loops below see the
    # exact same fold assignment.
    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds, random_state=42,
                                      shuffle=True)

    for seed_index, seed in enumerate(config.seeds):
        if args.only_pred:
            print('Skip training', flush=True)
            break
        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            print(f'Train fold {fold_index + 1}', flush=True)
            x_train = train_features.loc[train_indices, features_columns]
            y_train = train_targets.loc[train_indices, target_columns]
            x_val = train_features.loc[val_indices, features_columns]
            y_val = train_targets.loc[val_indices, target_columns]

            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=config.weight_decay,
                                   lr=config.learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                             mode='min',
                                                             factor=0.1,
                                                             patience=3,
                                                             eps=1e-4,
                                                             verbose=True)
            best_loss = np.inf

            for epoch in range(config.n_epochs):
                dataset = MoaDataset(x_train.values, y_train.values)
                # drop_last=True keeps batch statistics stable during training.
                dataloader = DataLoader(dataset,
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        drop_last=True)
                train_loss = loop_train(model, dataloader, optimizer,
                                        loss_functions=(
                                            smooth_loss_function,
                                            metric_loss_function,
                                        ))

                dataset = MoaDataset(x_val.values, y_val.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.val_batch_size,
                                        shuffle=False)
                valid_loss, _ = loop_valid(model, dataloader,
                                           metric_loss_function)

                print(
                    'Epoch {}/{} - loss: {:5.5f} - val_loss: {:5.5f}'.
                    format(epoch + 1, config.n_epochs, train_loss, valid_loss),
                    flush=True)
                logger.update({
                    'epoch': epoch + 1,
                    'loss': train_loss,
                    'val_loss': valid_loss
                })
                scheduler.step(valid_loss)
                # Checkpoint only on validation improvement.
                if valid_loss < best_loss:
                    best_loss = valid_loss
                    torch.save(model.state_dict(), checkpoint_path)

    # oof_preds holds one prediction slab per seed; test_preds accumulates
    # the seed-and-fold-averaged test predictions.
    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = np.zeros((len(test_features), n_targets))

    for seed_index in range(len(config.seeds)):
        seed = config.seeds[seed_index]
        print(f'Inference for seed {seed}', flush=True)
        _test_preds_in_seed = np.zeros((len(test_features), n_targets))

        # Same kfold object/params as training, so fold splits match.
        for fold_index, (_, valid_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            x_val = train_features.loc[valid_indices, features_columns]
            y_val = train_targets.loc[valid_indices, target_columns]
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            model = new_model(config.model_kind, len(features_columns)).to(DEVICE)
            model.load_state_dict(torch.load(checkpoint_path))

            dataset = MoaDataset(x_val.values, y_val.values)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)
            oof_preds[valid_indices, seed_index, :] = preds

            dataset = MoaDataset(test_features[features_columns].values, None)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)
            _test_preds_in_seed += preds / config.n_folds

        score = mean_log_loss(train_targets.loc[:, target_columns].values,
                              oof_preds[:, seed_index, :],
                              n_targets=n_targets)
        test_preds += _test_preds_in_seed / len(config.seeds)
        print(f'Score for this seed {score:5.5f}', flush=True)
        logger.update({'val_loss': score})

    # Evaluate overall validation score (mean over seeds).
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets.loc[:, target_columns].values,
                          oof_preds,
                          n_targets=n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    # Save validation prediction
    oof_pred_df = train_targets.copy()
    oof_pred_df.iloc[:, 1:] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Save log
    logger.update({'val_loss': score})
    logger.save(f'{model_dir}/log.csv')

    # Save Test Prediction. Reload the raw test CSV so cp_type is the
    # original string column (preprocess may have transformed it).
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = test_preds
    # Control vehicles have no MoA by construction; zero them out.
    submission.loc[test_features['cp_type'] == 'ctl_vehicle', target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)