def export_cube():
    cube = load_pickle(RegionPairTiming.cube_filename)

    README = """\
d_mu:
    mu(r2)-mu(r1) for every gene and region pair.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std:
    The combined standard deviation of the two change distributions.
    std = sqrt(0.5*(std1^2 + std2^2))
    Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
    The d' for the two change distributions. Equal to d_mu ./ combined_std.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
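# Illustrative sketch, not part of the original export code: how the cube
# quantities documented in the README above relate to the per-gene, per-region
# change distributions. The helper name is hypothetical, and `mu`/`std` are
# assumed to be numpy arrays of shape <n-genes> x <n-regions>, as exported by
# export_singles() below.
import numpy as np

def compute_cube_arrays(mu, std):
    """Return d_mu, combined_std and d-prime scores for every gene and region pair."""
    d_mu = mu[:, None, :] - mu[:, :, None]  # d_mu[g, r1, r2] = mu(r2) - mu(r1)
    combined_std = np.sqrt(0.5 * (std[:, :, None] ** 2 + std[:, None, :] ** 2))
    scores = d_mu / combined_std  # the d' score exported as 'scores'
    return d_mu, combined_std, scores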
def export_pathways():
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}  # NOTE that matlab is one based

    pathways = pathway_lists.read_all_pathways()
    pathway_names = list(pathways.keys())  # make sure the order stays fixed
    pathway_genes_names = np.array(
        [list_of_strings_to_matlab_cell_array(pathways[p]) for p in pathway_names],
        dtype=object)
    pathway_genes_idx = np.array(
        [np.array([matlab_g2i[g] for g in pathways[p]]) for p in pathway_names],
        dtype=object)
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}  # NOTE matlab indexing is one based

    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]

    README = """\
pathway_names:
    Cell array of all pathway names. The name in cell number k is the name of
    the pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
    Cell array (size <n-pathways>). Each cell contains a cell array of strings
    which are the gene symbols of the genes in that pathway.

pathway_genes_idx:
    Same as pathway_genes_names, but each cell in the outer cell array is now
    an array of gene indices corresponding to the gene positions in cube.mat
    and change-distributions.mat. This should be easier to use in matlab.

list_names:
    Names of pathway lists prepared by Noa

list_pathway_names:
    Cell array. One item per list. Each item is a cell array of strings which
    are the names of the pathways belonging to that list.

list_pathway_idx:
    Same as list_pathway_names, but instead of listing the pathways by name,
    they are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
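# Illustrative sketch, not part of the original export code: the exported index
# arrays are one-based (MATLAB convention), so reading them back in Python
# requires subtracting 1. The helper name and its arguments are hypothetical;
# they mirror the arrays built by export_pathways() above.
def genes_of_pathway(pathway, pathway_names, pathway_genes_idx, genes):
    """Return the gene symbols of `pathway` via the one-based index arrays."""
    k = list(pathway_names).index(pathway)  # position of the pathway
    return [genes[i - 1] for i in pathway_genes_idx[k]]  # i is one-based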
def __init__(self, listname='all'):
    self.listname = listname
    self.pathways = pathway_lists.read_all_pathways(listname)

    self.change_dist = load_pickle(SingleRegion.change_dist_filename,
                                   'change distribution for all genes and regions')
    self.genes = self.change_dist.genes
    self.regions = self.change_dist.regions
    self.g2i = {g: i for i, g in enumerate(self.genes)}
    self.r2i = {r: i for i, r in enumerate(self.regions)}
    self.age_scaler = self.change_dist.age_scaler
    self.mu = self.change_dist.mu
    self.std = self.change_dist.std
    self.bin_edges = self.change_dist.bin_edges
    self.bin_centers = self.change_dist.bin_centers
    self.weights = self.change_dist.weights
def export_singles():
    change_dist = load_pickle(SingleRegion.change_dist_filename)

    README = """\
mu:
    The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std:
    The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

weights:
    The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
    The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges:
    The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CHANGE_DISTRIBUTIONS=README,
        genes=list_of_strings_to_matlab_cell_array(change_dist.genes),
        regions=list_of_strings_to_matlab_cell_array(change_dist.regions),
        age_scaler=scalers.unify(change_dist.age_scaler).cache_name(),
        mu=change_dist.mu,
        std=change_dist.std,
        bin_edges=change_dist.bin_edges,
        bin_centers=change_dist.bin_centers,
        weights=change_dist.weights,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'change-distributions.mat'))
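# Illustrative sketch, not part of the original export code: reading the
# exported change-distributions.mat back into Python and recomputing bin
# centers from the bin edges, assuming the centers are the edge midpoints
# (as the README above suggests). The helper name is hypothetical and scipy
# is assumed to be available; `path` is wherever export_singles() wrote the file.
import numpy as np
from scipy.io import loadmat

def load_change_distributions(path):
    mat = loadmat(path)
    bin_edges = np.asarray(mat['bin_edges']).ravel()
    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])  # midpoints of adjacent edges
    weights = mat['weights']  # <n-genes> x <n-regions> x <n-bins>
    return bin_centers, weights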
def __init__(self, listname='all'):
    self.listname = listname
    self.single = SingleRegion(listname)
    self.pathways = self.single.pathways
    self.genes = self.single.genes
    self.regions = self.single.regions
    self.g2i = self.single.g2i
    self.r2i = self.single.r2i
    self.age_scaler = self.single.age_scaler
    self.mu = self.single.mu
    self.single_std = self.single.std

    cube = load_pickle(RegionPairTiming.cube_filename,
                       name='timing d-prime info for all genes and region pairs')
    self.d_mu = cube.d_mu
    self.pair_std = cube.std
    self.scores = self.d_mu / self.pair_std

    self.baseline = self.baseline_distribution_all_pairs(100, 10000)
def preprocess(config, model_dir, train_features, train_targets, test_features, dae_features):
    N_ORIGINAL_FEATURES = 872

    g_features_columns = [col for col in train_features.columns if col.startswith('g-')]
    c_features_columns = [col for col in train_features.columns if col.startswith('c-')]

    # Assign DAE features
    if config.dae_strategy == 'replace':
        train_features, test_features = assign_dae_features(
            train_features, test_features, dae_features, N_ORIGINAL_FEATURES)
    else:
        train_features, test_features, _ = merge_dae_features(
            train_features, test_features, dae_features,
            len(g_features_columns), len(c_features_columns))

    # Drop ctl_vehicle
    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    # Categorical encoding
    train_features, test_features, onehot_feature_columns = encode_categorical_features(train_features, test_features)

    # Normalize
    normalizing_columns = g_features_columns + c_features_columns + onehot_feature_columns
    train_features, test_features = normalize(train_features, test_features, normalizing_columns,
                                              norm_fun=config.norm_fun,
                                              concat_mode=config.norm_concat_mode,
                                              n_quantiles=config.gauss_n_quantiles)

    # Grouping features
    feature_groups = [g_features_columns, c_features_columns]

    # Add stats as features
    train_features, test_features, _ = add_stats(train_features, test_features, feature_groups,
                                                 concat_mode=config.stat_concat_mode)
    train_features, test_features, _ = c_squared(train_features, test_features, c_features_columns,
                                                 square_nums=config.square_nums,
                                                 concat_mode=config.sqrt_concat_mode)

    # PCA
    feature_names_pca = []
    if config.skip_pca is False:
        train_features, test_features, feature_names_pca = apply_pca(train_features, test_features,
                                                                     feature_groups=feature_groups,
                                                                     n_comp_ratio=config.pca_n_comp_ratio,
                                                                     concat_mode=config.pca_concat_mode)
    print(f'(PCA) Adding {len(feature_names_pca)} features ' +
          f'and having a total of {len(train_features.columns)} features.', flush=True)
    print('(PCA) train:', train_features.shape, flush=True)
    print('(PCA) test:', test_features.shape, flush=True)

    # Variance encoding
    variance_target_features = list(train_features.iloc[:, 4:].columns)
    pickle_path = f'{model_dir}/variance_encoder.pkl'
    if not os.path.exists(pickle_path):
        vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
        save_pickle(vt, pickle_path)
    vt = load_pickle(pickle_path)
    train_features = variance_reduction_transform(vt, train_features, variance_target_features)
    test_features = variance_reduction_transform(vt, test_features, variance_target_features)
    print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)

    return train_features, train_targets, test_features
def run(try_num, config):
    args = get_args()
    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = f"tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv(f"../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv(f"../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv(f"../input/lish-moa/test_features.csv")

    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)
        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features,
                len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'
        variance_target_features = num_feature_columns
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns
        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)
        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()

    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]
                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)
        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss

        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):
            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train, y_train, transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
            for param_group in optimizer.param_groups:
                print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)

            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})

            print(f'epoch {epoch + 1}/{config.n_epoch} - train_loss: {loss:.5f} - ' +
                  f'valid_loss: {valid_loss:.5f} - elapsed: {time_format(time.time() - start_time)}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds} - best_valid_loss: {best_score:.5f} - ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))

    start_time = time.time()
    print('Start inference', flush=True)

    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

        print(f'Fold {fold_index}/{config.kfolds} - fold_valid_loss: {valid_loss:.5f}', flush=True)
        logger.update({'fold': fold_index, 'val_loss': valid_loss})

        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)

    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)
    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference. Elapsed {time_format(time.time() - start_time)}', flush=True)

    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
def main():
    # Import settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--uni_flag', type=int, default=1,
                        help='unit test flag')
    parser.add_argument('--dat_path', type=str, default=None,
                        help='path to data directory')
    parser.add_argument('--img_name', type=str, default=None,
                        help='name of prediction file containing images')
    parser.add_argument('--nme_name', type=str, default=None,
                        help='name of name file corresponding to img preds')
    parser.add_argument('--sub_name', type=str, default=None,
                        help='name of submission file')
    parser.add_argument('--thres', type=float, default=0.5,
                        help='activation thresholding to transform SM vals')
    args = parser.parse_args()

    # Define some variables relative to parser inputs
    uni_flag = bool(args.uni_flag)
    data_path = args.dat_path
    if not uni_flag:
        # Paths are only needed for normal operation; the unit test uses in-memory data
        imgs_path = data_path + args.img_name
        name_path = data_path + args.nme_name
        subm_path = data_path + args.sub_name

    # Load data
    if uni_flag:
        # Unit test
        names = ['sample_1', 'sample_2', 'sample_3', 'sample_4']
        sample_1 = np.array([[0, 1, 1, 0],
                             [0, 0, 1, 0],
                             [1, 1, 1, 1],
                             [0, 0, 0, 1]])
        sample_2 = np.array([[0, 0, 0, 0],
                             [0, 0, 0, 0],
                             [0, 0, 0, 0],
                             [0, 0, 0, 0]])
        sample_3 = np.array([[1, 1, 1, 1],
                             [1, 0, 1, 0],
                             [1, 0, 0, 0],
                             [0, 0, 1, 0]])
        sample_4 = np.array([[1, 1, 1, 1],
                             [0, 0, 1, 0],
                             [1, 0, 0, 0],
                             [1, 1, 1, 0]])
        images = np.stack((sample_1, sample_2, sample_3, sample_4), axis=0)
    else:
        # Normal operation
        images = load_h5(imgs_path)
        names = load_pickle(name_path)

    # Transform data
    thresholded_images = np.uint8(images > args.thres)
    assert len(thresholded_images.shape) == 3

    # Make submissions
    df = make_submission(thresholded_images, names, uni_flag)

    if uni_flag:
        # Unit test
        assert np.array_equal(df['id'].values, names)
        assert df.loc[0]['rle_mask'] == '3 1 5 1 7 1 9 3 15 2', 'Sample 1'
        assert df.loc[1]['rle_mask'] == '', 'Sample 2'
        assert df.loc[2]['rle_mask'] == '1 3 5 1 9 2 12 2', 'Sample 3'
        assert df.loc[3]['rle_mask'] == '1 1 3 3 8 3 12 2', 'Sample 4'
    else:
        df.to_csv(subm_path, index=False)

    return None
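# Illustrative sketch, not part of the original script: the expected 'rle_mask'
# strings in the unit test above follow Kaggle-style run-length encoding. The
# mask is flattened in column-major (Fortran) order, pixel positions are
# one-based, and each run of 1s is written as a "start length" pair.
# make_submission() itself is not shown here; this standalone encoder simply
# reproduces the asserted strings.
import numpy as np

def rle_encode(mask):
    pixels = mask.flatten(order='F')                       # column-major flattening
    padded = np.concatenate([[0], pixels, [0]])            # sentinels so edge runs close
    changes = np.where(padded[1:] != padded[:-1])[0] + 1   # one-based run boundaries
    starts, ends = changes[::2], changes[1::2]
    return ' '.join(f'{s} {e - s}' for s, e in zip(starts, ends))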
def main():
    # Import settings (note that default debug settings are used)
    parser = argparse.ArgumentParser(description='TGS Challenge Main Script')
    parser.add_argument('--trn_path', type=str, default='./data/debug_train/',
                        help='path to training directory (default: debug)')
    parser.add_argument('--msk_path', type=str, default='./data/debug_masks',
                        help='path to mask directory (default: debug)')
    parser.add_argument('--tst_path', type=str, default='./data/debug_test/',
                        help='path to test directory (default: debug)')
    parser.add_argument('--mod_path', type=str, default='./weights/model_tmp/',
                        help='path to model weights directory (default: tmp)')
    parser.add_argument('--batch_size', type=int, default=3,
                        help='input batch size (default: 3)')
    parser.add_argument('--epochs', type=int, default=10,
                        help='number of epochs to train for (default: 10)')
    parser.add_argument('--starting_epoch', type=int, default=1,
                        help='index of starting epoch (default: 1)')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--lr_patience', type=int, default=10,
                        help='num epochs to wait for LR reduce (default: 10)')
    parser.add_argument('--print_every', type=int, default=1,
                        help='num batches before printing (default: 1)')
    parser.add_argument('--NUM_TRAIN', type=int, default=6,
                        help='num samples in split train set (default: 6)')
    parser.add_argument('--NUM_FULL', type=int, default=9,
                        help='num samples in full train set (default: 9)')
    args = parser.parse_args()

    # Define some variables relative to parser inputs
    trn_path = args.trn_path
    msk_path = args.msk_path
    tst_path = args.tst_path
    mod_path = args.mod_path
    starting_epoch = args.starting_epoch
    NUM_TRAIN = args.NUM_TRAIN
    NUM_FULL = args.NUM_FULL
    record_name = 'best_record.pickle'
    history_name = 'training_history.pickle'

    # Validate specified model path
    restart_token = check_dir(mod_path)  # Returns None if path exists

    # Define model (comment out irrelevant models as necessary)
    # net = ResSeg33(ResidualBlock)
    # net = ResSeg33_Reg(ResBlock_Reg)
    # net = ResSegVar(ResidualBlock, [3, 4, 6, 3])  # 45 layers
    net = ResSegVar(ResBlock_Reg, [6, 8, 12, 6])  # 77 layers

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    # Define or load training history
    def format_epoch_fname(start_num):
        return mod_path + 'epoch_%s.pth' % start_num

    best_record = {}
    training_history = {}
    if restart_token:
        # Starting from scratch
        curr_epoch = 1
        best_record['epoch'] = 0
        best_record['val_loss'] = 1e10
        best_record['mean_iou'] = 0
    else:
        print('Resuming training from epoch:', starting_epoch)
        net.load_state_dict(torch.load(format_epoch_fname(starting_epoch)))
        curr_epoch = starting_epoch + 1
        best_record = load_pickle(mod_path + record_name)
        training_history = load_pickle(mod_path + history_name)

    # Define device and dtype
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dtype = torch.float32

    # Parallelization init and set net to CUDA if possible
    if torch.cuda.is_available():
        net.cuda()
        net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))

    # Load data
    paths = (trn_path, msk_path, tst_path)
    stats = (NUM_TRAIN, NUM_FULL, args.batch_size)
    trn_set, val_set, tst_set = data_formatter(paths, stats)

    # Unpack data
    trn_data, trn_load = trn_set
    val_data, val_load = val_set
    tst_data, tst_load = tst_set

    # Define automatic LR reduction scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min',
                                                     patience=args.lr_patience,
                                                     min_lr=1e-10)

    # Model API parameters
    param_dict = {
        'loader': None,
        'net': net,
        'criterion': criterion,
        'optimizer': optimizer,
        'epoch': 1,
        'args': args,
        'device': device,
        'dtype': dtype
    }

    # Note: epoch starts from 1, not 0
    for i, epoch in enumerate(range(curr_epoch, args.epochs + 1)):
        # Update epoch
        param_dict['epoch'] = epoch

        # Train
        param_dict['loader'] = trn_load
        trn_log = train(**param_dict)

        # Validate
        param_dict['loader'] = val_load
        val_loss, mean_iou = validate(**param_dict)

        # Update logging files
        training_history['epoch_%s' % (i + 1)] = trn_log

        # Save weights if the validation loss improves
        if val_loss < best_record['val_loss']:
            best_record['epoch'] = epoch
            best_record['val_loss'] = val_loss
            best_record['mean_iou'] = mean_iou
            torch.save(net.state_dict(), format_epoch_fname(epoch))

        # Print best record information
        print('--------------------------------------')
        print('best record: [epoch %d], [val_loss %.4f], [mean_iou %.4f]' % (
            best_record['epoch'], best_record['val_loss'], best_record['mean_iou']))
        print('--------------------------------------')
        print('')

        # Save logging information every epoch
        save_pickle(data=training_history, path=mod_path + history_name)
        save_pickle(data=best_record, path=mod_path + record_name)

        scheduler.step(val_loss)