def export_cube():
    cube = load_pickle(RegionPairTiming.cube_filename)
    README = """\
d_mu:
mu(r2)-mu(r1) for every gene and region pair. 
Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std: 
The combined standard deviation of the two change distributions.
std = sqrt(0.5*(std1^2 + std2^2))
Dimensions: <n-genes> X <n-regions> X <n-regions>

score:
The d' for the two change distributions. Equal to d_mu ./ combined_std.
Dimensions: <n-genes> X <n-regions> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (e.g. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE = README,
        genes = list_of_strings_to_matlab_cell_array(cube.genes),
        regions = list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler = scalers.unify(cube.age_scaler).cache_name(),
        d_mu = cube.d_mu,
        combined_std = cube.std,
        scores = cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
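For reference, a minimal sketch of reading the exported file back from Python; the path below is a placeholder for wherever results_dir()/export actually resolves to:

# Minimal sketch: read cube.mat back and re-derive the d' scores.
# The path is a placeholder, not the project's actual results directory.
import numpy as np
from scipy.io import loadmat

cube_mat = loadmat('results/export/cube.mat')
d_mu = cube_mat['d_mu']                  # <n-genes> x <n-regions> x <n-regions>
combined_std = cube_mat['combined_std']  # same shape
scores = cube_mat['scores']

# scores should equal d_mu / combined_std up to floating-point noise
assert np.allclose(scores, d_mu / combined_std, equal_nan=True)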
def export_pathways():
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    matlab_g2i = {g:(i+1) for i,g in enumerate(change_dist.genes)} # NOTE that matlab is one based
    
    pathways = pathway_lists.read_all_pathways()
    pathway_names = list(pathways.keys()) # snapshot the keys so the order stays fixed
    pathway_genes_names = np.array([list_of_strings_to_matlab_cell_array(pathways[p]) for p in pathway_names], dtype=object)
    pathway_genes_idx = np.array([np.array([matlab_g2i[g] for g in pathways[p]]) for p in pathway_names], dtype=object)

    matlab_p2i = {p:(i+1) for i,p in enumerate(pathway_names)} # NOTE matlab indexing is one based
    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i,listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]
    README = """\
pathway_names:
Cell array of all pathway names. The name in cell number k is the name of the
pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
Cell array (size <n-pathways>). Each cell contains a cell array of strings which 
are the gene symbols of the genes in that pathway.

pathway_genes_idx:
Same as pathway_genes_names, but each cell in the outer cell array is now an 
array of gene indices corresponding to the gene positions in cube.mat and change-distributions.mat.
This should be easier to use in MATLAB.

list_names:
Names of pathway lists prepared by Noa

list_pathway_names:
Cell array. One item per list. Each item is a cell array of strings which are 
the names of the pathways belonging to that list.

list_pathway_idx:
Same as list_pathway_names, but instead of listing the pathways by name, they 
are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS = README,
        pathway_names = list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names = pathway_genes_names,
        pathway_genes_idx = pathway_genes_idx,
        list_names = list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names = list_pathway_names,
        list_pathway_idx = list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
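A hedged sketch of consuming pathways.mat from Python; the file paths and the exact unwrapping of the loaded cell arrays are assumptions and may need adjusting to scipy's loadmat output:

# Sketch: map the 1-based MATLAB gene indices of one pathway back to gene symbols
# via the 'genes' array exported in cube.mat. Paths and cell-array handling are
# assumptions about the loadmat output.
from scipy.io import loadmat

pw = loadmat('results/export/pathways.mat', squeeze_me=True)
cube_mat = loadmat('results/export/cube.mat', squeeze_me=True)

genes = list(cube_mat['genes'])                   # gene symbols, 0-based positions
first_pathway_idx = pw['pathway_genes_idx'][0]    # 1-based indices of the first pathway
symbols = [genes[int(i) - 1] for i in first_pathway_idx]
print(pw['pathway_names'][0], symbols[:5])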
Example #4
    def __init__(self, listname='all'):
        self.listname = listname
        self.pathways = pathway_lists.read_all_pathways(listname)

        self.change_dist = load_pickle(SingleRegion.change_dist_filename, 'change distribution for all genes and regions')
        self.genes = self.change_dist.genes
        self.regions = self.change_dist.regions
        self.g2i = {g:i for i,g in enumerate(self.genes)}
        self.r2i = {r:i for i,r in enumerate(self.regions)}
        self.age_scaler = self.change_dist.age_scaler
        self.mu = self.change_dist.mu
        self.std = self.change_dist.std
        self.bin_edges = self.change_dist.bin_edges
        self.bin_centers = self.change_dist.bin_centers
        self.weights = self.change_dist.weights
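This appears to be SingleRegion.__init__ (it is instantiated as SingleRegion(listname) in later examples). A short usage sketch; the gene and region names are hypothetical placeholders:

# Usage sketch (gene/region names are hypothetical placeholders):
single = SingleRegion(listname='all')
gi = single.g2i['HTR1A']   # row index for a gene of interest
ri = single.r2i['V1C']     # column index for a region of interest
print('mean change age (scaled):', single.mu[gi, ri], '+/-', single.std[gi, ri])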
Example #5
def export_singles():
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    README = """\
mu:
The mean age of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

std:
The standard deviation of the change distribution for given gene and region.
Dimensions: <n-genes> X <n-regions>

genes: 
Gene names for the genes represented in other arrays

weights:
The change distributions for each gene and region.
Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
The ages for the center of each bin used in calculating the histogram in "weights".
Dimensions: <n-bins> X 1

bin_edges:
The edges of the bins used in calculating the change histogram.
(centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
Dimensions: <n-bins + 1> X 1

regions: 
Region names for the regions represented in other arrays

age_scaler: 
The scaling used for ages (e.g. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CHANGE_DISTRIBUTIONS=README,
        genes=list_of_strings_to_matlab_cell_array(change_dist.genes),
        regions=list_of_strings_to_matlab_cell_array(change_dist.regions),
        age_scaler=scalers.unify(change_dist.age_scaler).cache_name(),
        mu=change_dist.mu,
        std=change_dist.std,
        bin_edges=change_dist.bin_edges,
        bin_centers=change_dist.bin_centers,
        weights=change_dist.weights,
    )
    save_matfile(mdict,
                 join(results_dir(), 'export', 'change-distributions.mat'))
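A sketch of inspecting one change distribution from the exported file; the path and the (gene, region) indices are placeholders:

# Sketch: plot the change histogram for one (gene, region) pair from
# change-distributions.mat. Path and indices are placeholders.
import matplotlib.pyplot as plt
from scipy.io import loadmat

cd = loadmat('results/export/change-distributions.mat')
weights = cd['weights']                  # <n-genes> x <n-regions> x <n-bins>
bin_centers = cd['bin_centers'].ravel()  # <n-bins>

g, r = 0, 0
plt.bar(bin_centers, weights[g, r, :], width=bin_centers[1] - bin_centers[0])
plt.xlabel('scaled age')
plt.ylabel('weight')
plt.show()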
Example #8
    def __init__(self, listname='all'):
        self.listname = listname
        self.single = SingleRegion(listname)
        self.pathways = self.single.pathways
        self.genes = self.single.genes
        self.regions = self.single.regions
        self.g2i = self.single.g2i
        self.r2i = self.single.r2i
        self.age_scaler = self.single.age_scaler
        self.mu = self.single.mu
        self.single_std = self.single.std

        cube = load_pickle(RegionPairTiming.cube_filename, name='timing d-prime info for all genes and region pairs')
        self.d_mu = cube.d_mu
        self.pair_std = cube.std
        self.scores = self.d_mu / self.pair_std

        self.baseline = self.baseline_distribution_all_pairs(100, 10000)
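The d' score computed here (and documented in the cube.mat README above) reduces to a few lines; a minimal numeric sketch with made-up values:

# Minimal numeric sketch of the d' score: d_mu / combined_std, with
# combined_std = sqrt(0.5*(std1^2 + std2^2)). Values are made up for illustration.
import numpy as np

mu_r1, std_r1 = 0.8, 0.30   # change distribution of region 1 (scaled age)
mu_r2, std_r2 = 1.1, 0.40   # change distribution of region 2

d_mu = mu_r2 - mu_r1
combined_std = np.sqrt(0.5 * (std_r1 ** 2 + std_r2 ** 2))
score = d_mu / combined_std  # positive => region 2 changes later than region 1
print(round(score, 3))       # ~0.849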
Example #11
def preprocess(config, model_dir, train_features, train_targets, test_features, dae_features):
    N_ORIGINAL_FEATURES = 872

    g_features_columns = [col for col in train_features.columns if col.startswith('g-')]
    c_features_columns = [col for col in train_features.columns if col.startswith('c-')]

    # Assign DAE features
    if config.dae_strategy == 'replace':
        train_features, test_features = assign_dae_features(
            train_features, test_features, dae_features, N_ORIGINAL_FEATURES)
    else:
        train_features, test_features, _ = merge_dae_features(
            train_features, test_features, dae_features, len(g_features_columns), len(c_features_columns))

    # Drop ctl_vehicle
    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    # Categorical encoding
    train_features, test_features, onehot_feature_columns = encode_categorical_features(train_features, test_features)

    # Normalize
    normalizing_columns = g_features_columns + c_features_columns + onehot_feature_columns
    train_features, test_features = normalize(train_features, test_features, normalizing_columns,
                                              norm_fun=config.norm_fun, concat_mode=config.norm_concat_mode,
                                              n_quantiles=config.gauss_n_quantiles)

    # Grouping features
    feature_groups = [g_features_columns, c_features_columns]

    # Add stats as features
    train_features, test_features, _ = add_stats(train_features, test_features, feature_groups,
                                                 concat_mode=config.stat_concat_mode)

    train_features, test_features, _ = c_squared(train_features, test_features, c_features_columns,
                                                 square_nums=config.square_nums, concat_mode=config.sqrt_concat_mode)

    # PCA
    feature_names_pca = []
    if config.skip_pca is False:
        train_features, test_features, feature_names_pca = apply_pca(train_features, test_features,
                                                                     feature_groups=feature_groups,
                                                                     n_comp_ratio=config.pca_n_comp_ratio,
                                                                     concat_mode=config.pca_concat_mode)
        print(
            f'(PCA) Adding {len(feature_names_pca)} features ' +
            f'and having a total of {len(train_features.columns)} features.',
            flush=True
        )
        print('(PCA) train:', train_features.shape, flush=True)
        print('(PCA) test:', test_features.shape, flush=True)

    # Variance encoding
    variance_target_features = list(train_features.iloc[:, 4:].columns)
    pickle_path = f'{model_dir}/variance_encoder.pkl'

    if not os.path.exists(pickle_path):
        vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
        save_pickle(vt, pickle_path)

    vt = load_pickle(pickle_path)
    train_features = variance_reduction_transform(vt, train_features, variance_target_features)
    test_features = variance_reduction_transform(vt, test_features, variance_target_features)
    print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)

    return train_features, train_targets, test_features
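preprocess() leans on variance_reduction_fit / variance_reduction_transform, which are not shown here. A hedged sketch of how such helpers could be built on sklearn's VarianceThreshold (an assumption, not necessarily this project's implementation):

# Hedged sketch of the variance-reduction helpers, assuming they wrap sklearn's
# VarianceThreshold. Signatures mirror the calls above; the real code is not shown.
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

def variance_reduction_fit(df: pd.DataFrame, columns, threshold: float) -> VarianceThreshold:
    vt = VarianceThreshold(threshold=threshold)
    vt.fit(df[columns])
    return vt

def variance_reduction_transform(vt: VarianceThreshold, df: pd.DataFrame, columns) -> pd.DataFrame:
    kept = [c for c, keep in zip(columns, vt.get_support()) if keep]
    other = [c for c in df.columns if c not in columns]
    return pd.concat([df[other], df[kept]], axis=1)  # drop low-variance candidate columns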
def run(try_num, config):
    args = get_args()

    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = f"tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv(f"../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv(f"../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv(f"../input/lish-moa/test_features.csv")

    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)
    train_targets = train_targets.sort_values(by=["sig_id"], axis=0, inplace=False).reset_index(drop=True)

    cat_features_columns = ["cp_dose", 'cp_time']
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features, len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'

        variance_target_features = list(num_feature_columns)  # copy so appending DAE columns below doesn't mutate num_feature_columns
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns

        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)

        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = np.sum(train_targets.drop("sig_id", axis=1), axis=0).index.tolist()
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]

                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)

        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i, c in enumerate(target_columns)]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss

        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):

            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train,
                    y_train,
                    transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)

            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
                for param_group in optimizer.param_groups:
                    print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})
            print(f'epoch {epoch + 1}/{config.n_epoch}  -  train_loss: {loss:.5f}  -  ' +
                  f'valid_loss: {valid_loss:.5f}  -  elapsed: {time_format(time.time() - start_time)}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds}  -  best_valid_loss: {best_score:.5f}  -  ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))
    start_time = time.time()
    print('Start inference', flush=True)

    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')
        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)
        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds}  -  fold_valid_loss: {valid_loss:.5f}', flush=True)
        logger.update({'fold': fold_index, 'val_loss': valid_loss})

        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)
    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)

    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference  Elapsed {time_format(time.time() - start_time)}', flush=True)
    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
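The OOF evaluation above uses mean_log_loss, presumably the competition's mean column-wise binary cross-entropy; a minimal sketch (the clipping bound is an assumption added for numerical stability):

# Hedged sketch of mean_log_loss: mean binary cross-entropy over all targets and
# samples (equivalent to the mean of per-column log losses).
import numpy as np

def mean_log_loss(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-15) -> float:
    p = np.clip(y_pred, eps, 1 - eps)
    return float(np.mean(-(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))))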
def main():
    # Import settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--uni_flag', type=int, default=1, help='unit tst flg')
    parser.add_argument('--dat_path',
                        type=str,
                        default=None,
                        help='path to data directory')
    parser.add_argument('--img_name',
                        type=str,
                        default=None,
                        help='name of prediction file containing images')
    parser.add_argument('--nme_name',
                        type=str,
                        default=None,
                        help='name of name file corresponding to img preds')
    parser.add_argument('--sub_name',
                        type=str,
                        default=None,
                        help='name of submission file')
    parser.add_argument('--thres',
                        type=float,
                        default=0.5,
                        help='activation thresholding to transform SM vals')
    args = parser.parse_args()

    # Define some variables relative to parser inputs
    data_path = args.dat_path
    imgs_path = data_path + args.img_name
    name_path = data_path + args.nme_name
    subm_path = data_path + args.sub_name

    uni_flag = bool(args.uni_flag)

    # Load data
    if uni_flag:  # Unit test
        names = ['sample_1', 'sample_2', 'sample_3', 'sample_4']
        sample_1 = np.array([[0, 1, 1, 0], [0, 0, 1, 0], [1, 1, 1, 1],
                             [0, 0, 0, 1]])
        sample_2 = np.array([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
                             [0, 0, 0, 0]])
        sample_3 = np.array([[1, 1, 1, 1], [1, 0, 1, 0], [1, 0, 0, 0],
                             [0, 0, 1, 0]])
        sample_4 = np.array([[1, 1, 1, 1], [0, 0, 1, 0], [1, 0, 0, 0],
                             [1, 1, 1, 0]])
        images = np.stack((sample_1, sample_2, sample_3, sample_4), axis=0)
    else:  # Normal operation
        images = load_h5(imgs_path)
        names = load_pickle(name_path)

    # Transform data
    thresholded_images = np.uint8(images > args.thres)
    assert len(thresholded_images.shape) == 3

    # Make submissions
    df = make_submission(thresholded_images, names, uni_flag)

    if uni_flag:  # Unit test
        assert np.array_equal(df['id'].values, names)
        assert df.loc[0]['rle_mask'] == '3 1 5 1 7 1 9 3 15 2', 'Sample 1'
        assert df.loc[1]['rle_mask'] == '', 'Sample 2'
        assert df.loc[2]['rle_mask'] == '1 3 5 1 9 2 12 2', 'Sample 3'
        assert df.loc[3]['rle_mask'] == '1 1 3 3 8 3 12 2', 'Sample 4'
    else:
        df.to_csv(subm_path, index=False)

    return None
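The unit-test asserts above pin down the run-length encoding used by make_submission: pixels are numbered 1-based in column-major order (top-to-bottom, then left-to-right), and runs of foreground pixels are written as "start length" pairs. A sketch consistent with those asserts (not necessarily the actual make_submission internals):

# Run-length encoder consistent with the unit-test asserts above:
# 1-based, column-major pixel numbering, 'start length' pairs for runs of 1s.
import numpy as np

def rle_encode(mask: np.ndarray) -> str:
    pixels = mask.flatten(order='F')                       # column-major
    padded = np.concatenate([[0], pixels, [0]])
    changes = np.where(padded[1:] != padded[:-1])[0] + 1   # 1-based run boundaries
    starts, ends = changes[::2], changes[1::2]
    return ' '.join('%d %d' % (s, e - s) for s, e in zip(starts, ends))

# e.g. rle_encode(sample_1) == '3 1 5 1 7 1 9 3 15 2'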
def main():
    # Import settings (note that default debug settings are used)
    parser = argparse.ArgumentParser(description='TGS Challenge Main Script')
    parser.add_argument('--trn_path',
                        type=str,
                        default='./data/debug_train/',
                        help='path to training directory (default: debug)')
    parser.add_argument('--msk_path',
                        type=str,
                        default='./data/debug_masks',
                        help='path to mask directory (default: debug)')
    parser.add_argument('--tst_path',
                        type=str,
                        default='./data/debug_test/',
                        help='path to test directory (default: debug)')
    parser.add_argument('--mod_path',
                        type=str,
                        default='./weights/model_tmp/',
                        help='path to model weights directory (default: tmp)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=3,
                        help='input batch size (default: 3)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        help='number of epochs to train for (default: 10)')
    parser.add_argument('--starting_epoch',
                        type=int,
                        default=1,
                        help='index of starting epoch (default: 1)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--lr_patience',
                        type=int,
                        default=10,
                        help='num epochs to wait for LR reduce (default: 10)')
    parser.add_argument('--print_every',
                        type=int,
                        default=1,
                        help='num batches before printing (default: 1)')
    parser.add_argument('--NUM_TRAIN',
                        type=int,
                        default=6,
                        help='num samples in split train set (default: 6)')
    parser.add_argument('--NUM_FULL',
                        type=int,
                        default=9,
                        help='num samples in full train set (default: 9)')
    args = parser.parse_args()

    # Define some variables relative to parser inputs
    trn_path = args.trn_path
    msk_path = args.msk_path
    tst_path = args.tst_path
    mod_path = args.mod_path
    starting_epoch = args.starting_epoch
    NUM_TRAIN = args.NUM_TRAIN
    NUM_FULL = args.NUM_FULL

    record_name = 'best_record.pickle'
    history_name = 'training_history.pickle'

    # Validate specified model path
    restart_token = check_dir(mod_path)  # Returns None if path exists

    # Define model (comment out irrelevant models as necessary)
    # net = ResSeg33(ResidualBlock)
    # net = ResSeg33_Reg(ResBlock_Reg)
    # net = ResSegVar(ResidualBlock, [3, 4, 6, 3]) # 45 layers
    net = ResSegVar(ResBlock_Reg, [6, 8, 12, 6])  # 77 layers

    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    # Define or load training history
    def format_epoch_fname(start_num):
        return mod_path + 'epoch_%s.pth' % start_num

    best_record = {}
    training_history = {}
    if restart_token:  # Starting from scratch
        curr_epoch = 1
        best_record['epoch'] = 0
        best_record['val_loss'] = 1e10
        best_record['mean_iou'] = 0
    else:
        print('Resuming training from epoch:', starting_epoch)
        net.load_state_dict(torch.load(format_epoch_fname(starting_epoch)))
        curr_epoch = starting_epoch + 1
        best_record = load_pickle(mod_path + record_name)
        training_history = load_pickle(mod_path + history_name)

    # Define device and dtype
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dtype = torch.float32
    # Parallelization init and set net to CUDA if possible
    if torch.cuda.is_available():
        net.cuda()
        net = torch.nn.DataParallel(net,
                                    device_ids=range(
                                        torch.cuda.device_count()))

    # Load data
    paths = (trn_path, msk_path, tst_path)
    stats = (NUM_TRAIN, NUM_FULL, args.batch_size)
    trn_set, val_set, tst_set = data_formatter(paths, stats)
    # Unpack data
    trn_data, trn_load = trn_set
    val_data, val_load = val_set
    tst_data, tst_load = tst_set

    # Define automatic LR reduction scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                     mode='min',
                                                     patience=args.lr_patience,
                                                     min_lr=1e-10)
    # Model API parameters
    param_dict = {
        'loader': None,
        'net': net,
        'criterion': criterion,
        'optimizer': optimizer,
        'epoch': 1,
        'args': args,
        'device': device,
        'dtype': dtype
    }
    # Note: epoch starts from 1, not 0
    for i, epoch in enumerate(range(curr_epoch, args.epochs + 1)):
        # Update epoch
        param_dict['epoch'] = epoch
        # Train
        param_dict['loader'] = trn_load
        trn_log = train(**param_dict)
        # Validate
        param_dict['loader'] = val_load
        val_loss, mean_iou = validate(**param_dict)

        # Update logging files
        training_history['epoch_%s' % epoch] = trn_log  # key by the actual epoch number (matters when resuming)
        # Save weights if validation loss improves
        if val_loss < best_record['val_loss']:
            best_record['epoch'] = epoch
            best_record['val_loss'] = val_loss
            best_record['mean_iou'] = mean_iou
            torch.save(net.state_dict(), format_epoch_fname(epoch))

        # Print best record information
        print('--------------------------------------')
        print('best record: [epoch %d], [val_loss %.4f], [mean_iou %.4f]' % (
            best_record['epoch'], best_record['val_loss'],
            best_record['mean_iou']))
        print('--------------------------------------')
        print('')

        # Save logging information every epoch
        save_pickle(data=training_history, path=mod_path + history_name)
        save_pickle(data=best_record, path=mod_path + record_name)

        scheduler.step(val_loss)
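All of the snippets above rely on load_pickle / save_pickle helpers that are not shown. A minimal sketch compatible with the call patterns seen here (positional path, an optional description, and the keyword form save_pickle(data=..., path=...)); this is an assumption, not the projects' actual implementation:

# Hedged sketch of the pickle helpers used throughout these examples.
import pickle

def load_pickle(path, name=None):
    # 'name' is an optional human-readable description, used only for logging
    if name is not None:
        print('loading %s from %s' % (name, path))
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)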