def cp_cnn_moa_train_prediction(self):
        """Train ``CNN_Model`` with k-fold cross-validation and save its predictions.

        Reads the level-4 train/test CSVs and the target-label CSV from
        ``self.data_dir`` (file names depend on ``self.shuffle`` and
        ``self.subsample``), trains one model per fold, stores the checkpoint
        with the lowest validation loss of each fold under
        ``<data_dir>/<model_dir_name>``, and averages the per-fold test
        predictions. Out-of-fold (OOF) and test predictions are handed to
        ``model_eval_results`` and written to ``self.model_pred_dir`` through
        ``save_to_csv``.

        Instance attributes read: ``shuffle``, ``subsample``, ``data_dir``,
        ``BATCH_SIZE``, ``LEARNING_RATE``, ``EPOCHS``, ``model_pred_dir`` and
        ``output_file_indicator``.

        Returns:
            None. Results are persisted to disk and progress is printed.
        """
        print("Is GPU Available?")
        if torch.cuda.is_available():
            print("Yes, GPU is Available!!")
        else:
            print("No, GPU is NOT Available!!", "\n")

        DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
        no_of_components = 25
        NFOLDS = 5
        WEIGHT_DECAY = 1e-5
        EARLY_STOPPING_STEPS = 10
        EARLY_STOP = False
        hidden_size = 4096
        ##dir names
        model_file_name = "cp_1dcnn"
        model_dir_name = "cp_cnn_model"
        trn_pred_name = 'cp_train_preds_1dcnn'
        tst_pred_name = 'cp_test_preds_1dcnn'
        model_file_name, model_dir_name, trn_pred_name, tst_pred_name = \
        check_if_shuffle_data(self.shuffle, model_file_name, model_dir_name, trn_pred_name, tst_pred_name)
        model_dir = os.path.join(self.data_dir, model_dir_name)
        os.makedirs(model_dir, exist_ok=True)

        # Setup file names. Only the training file has a shuffled variant; the
        # subsample suffix applies to train, test and target files alike.
        shuffle_tag = "_shuffle" if self.shuffle else ""
        subsample_tag = "_subsample" if self.subsample else ""
        input_train_file = os.path.join(
            self.data_dir,
            f"train{shuffle_tag}_lvl4_data{subsample_tag}.csv.gz")
        input_test_file = os.path.join(
            self.data_dir, f"test_lvl4_data{subsample_tag}.csv.gz")
        input_target_file = os.path.join(
            self.data_dir, f"target_labels{subsample_tag}.csv")

        df_train = pd.read_csv(input_train_file,
                               compression='gzip',
                               low_memory=False)
        df_test = pd.read_csv(input_test_file,
                              compression='gzip',
                              low_memory=False)
        df_targets = pd.read_csv(input_target_file)

        metadata_cols = [
            'Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_Plate',
            'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa', 'broad_id',
            'pert_iname', 'moa', 'replicate_name', 'Metadata_dose_recode'
        ]

        # The first column of the target file is skipped; the rest are labels.
        target_cols = df_targets.columns[1:]
        df_train_x, df_train_y, df_test_x, df_test_y = split_data(
            df_train, df_test, metadata_cols, target_cols)
        features = df_train_x.columns.tolist()
        # presumably preprocess() appends `no_of_components` extra columns per
        # fold, hence the enlarged input width -- TODO confirm against preprocess
        num_features = len(features) + no_of_components
        num_targets = len(target_cols)
        df_train = drug_stratification(df_train,
                                       NFOLDS,
                                       target_cols,
                                       col_name='replicate_name',
                                       cpd_freq_num=24)
        pos_weight = initialize_weights(df_train, target_cols, DEVICE)

        def model_train_pred(fold, Model=CNN_Model, file_name=model_file_name):
            """Train one fold and return ``(oof, test_predictions)``.

            ``oof`` has the shape of ``df_train_y`` and is filled only at the
            validation rows of this fold.
            """
            model_path = os.path.join(model_dir,
                                      file_name + f"_FOLD{fold}.pth")
            x_fold_train, y_fold_train, x_fold_val, y_fold_val, df_test_x_copy, val_idx = \
            preprocess(fold, df_train, df_train_x, df_train_y, df_test_x, no_of_components)
            train_dataset = TrainDataset(x_fold_train.values,
                                         y_fold_train.values)
            valid_dataset = TrainDataset(x_fold_val.values, y_fold_val.values)

            trainloader = torch.utils.data.DataLoader(
                train_dataset, batch_size=self.BATCH_SIZE, shuffle=True)
            validloader = torch.utils.data.DataLoader(
                valid_dataset, batch_size=self.BATCH_SIZE, shuffle=False)

            model = Model(num_features=num_features,
                          num_targets=num_targets,
                          hidden_size=hidden_size)
            model.to(DEVICE)
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=self.LEARNING_RATE,
                                         weight_decay=WEIGHT_DECAY,
                                         eps=1e-9)
            # NOTE(review): OneCycleLR schedules the rate from max_lr, so
            # self.LEARNING_RATE is presumably overridden once the scheduler
            # steps -- confirm against train_fn.
            scheduler = optim.lr_scheduler.OneCycleLR(
                optimizer=optimizer,
                pct_start=0.1,
                div_factor=1e3,
                max_lr=1e-2,
                epochs=self.EPOCHS,
                steps_per_epoch=len(trainloader))
            loss_train = SmoothBCEwLogits(smoothing=0.001,
                                          pos_weight=pos_weight)
            loss_val = nn.BCEWithLogitsLoss()
            early_stopping_steps = EARLY_STOPPING_STEPS
            early_step = 0
            oof = np.zeros(df_train_y.shape)
            best_loss = np.inf
            best_loss_epoch = -1

            for epoch in range(self.EPOCHS):
                train_loss = train_fn(model, optimizer, scheduler, loss_train,
                                      trainloader, DEVICE)
                valid_loss, valid_preds = valid_fn(model, loss_val,
                                                   validloader, DEVICE)
                if valid_loss < best_loss:
                    # New best epoch: keep its validation predictions and weights.
                    best_loss = valid_loss
                    best_loss_epoch = epoch
                    oof[val_idx] = valid_preds
                    torch.save(model.state_dict(), model_path)
                elif EARLY_STOP:
                    # NOTE(review): early_step is never reset on improvement, so
                    # it counts all non-improving epochs, not consecutive ones.
                    early_step += 1
                    if early_step >= early_stopping_steps:
                        break
                print(
                    f"FOLD: {fold}, EPOCH: {epoch},train_loss: {train_loss:.6f},\
                valid_loss: {valid_loss:.6f} best_loss: {best_loss:.6f}, best_loss_epoch: {best_loss_epoch}"
                )

            #--------------------- PREDICTION---------------------
            testdataset = TestDataset(df_test_x_copy.values)
            testloader = torch.utils.data.DataLoader(
                testdataset, batch_size=self.BATCH_SIZE, shuffle=False)
            # Rebuild the network and restore the best checkpoint of this fold;
            # map_location keeps the load valid on whichever device is active.
            model = Model(num_features=num_features,
                          num_targets=num_targets,
                          hidden_size=hidden_size)
            model.load_state_dict(torch.load(model_path, map_location=DEVICE))
            model.to(DEVICE)

            predictions = inference_fn(model, testloader, DEVICE)
            return oof, predictions

        def run_k_fold(NFOLDS, df_train_y=df_train_y, df_test_y=df_test_y):
            """Run every fold; sum the OOF blocks and average test predictions."""
            oof = np.zeros(df_train_y.shape)
            predictions = np.zeros(df_test_y.shape)
            for fold in range(NFOLDS):
                oof_, pred_ = model_train_pred(fold)
                predictions += pred_ / NFOLDS
                oof += oof_
            return oof, predictions

        oofs_, predictions_ = run_k_fold(NFOLDS)
        df_oofs = pd.DataFrame(oofs_, columns=df_train_y.columns)
        df_preds = pd.DataFrame(predictions_, columns=df_test_y.columns)

        model_eval_results(df_train_y, oofs_, df_test, df_test_y, df_preds,
                           target_cols)
        save_to_csv(df_preds, self.model_pred_dir,
                    f"{tst_pred_name}{self.output_file_indicator}.csv")
        save_to_csv(df_oofs,
                    self.model_pred_dir,
                    f"{trn_pred_name}{self.output_file_indicator}.csv.gz",
                    compress="gzip")
        print(
            "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
        )
# Example no. 2
# 0
    def L1000_nn_moa_train_prediction(self):
        """Train ``SimpleNN_Model`` over several seeds and folds and save predictions.

        Reads the level-4 train/test CSVs and ``target_labels.csv`` from
        ``self.data_dir`` (the training file depends on ``self.shuffle``),
        augments the features with ``umap_factor_features``, and for every
        seed in ``range(NSEEDS)`` runs ``NFOLDS``-fold cross-validation. The
        checkpoint with the lowest validation loss of each (seed, fold) is
        stored under ``<data_dir>/<model_dir_name>``. Out-of-fold (OOF) and
        test predictions are averaged over seeds, evaluated with
        ``model_eval_results`` and written to ``self.model_pred_dir`` through
        ``save_to_csv``.

        Instance attributes read: ``shuffle``, ``data_dir``, ``BATCH_SIZE``,
        ``LEARNING_RATE``, ``EPOCHS`` and ``model_pred_dir``.

        Returns:
            None. Results are persisted to disk and progress is printed.
        """
        print("Is GPU Available?")
        if torch.cuda.is_available():
            print("Yes, GPU is Available!!")
        else:
            print("No, GPU is NOT Available!!", "\n")

        DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
        no_of_compts = 50
        no_of_dims = 25
        # When False, training is skipped and existing checkpoints are reused.
        IS_TRAIN = True
        NSEEDS = 5
        SEED = range(NSEEDS)
        NFOLDS = 5
        WEIGHT_DECAY = 1e-5
        EARLY_STOPPING_STEPS = 10
        EARLY_STOP = False
        hidden_size = 1024
        ##dir names
        model_file_name = "L1000_simplenn"
        model_dir_name = "L1000_simplenn_model"
        trn_pred_name = 'L1000_train_preds_simplenn'
        tst_pred_name = 'L1000_test_preds_simplenn'
        model_file_name, model_dir_name, trn_pred_name, tst_pred_name = \
        check_if_shuffle_data(self.shuffle, model_file_name, model_dir_name, trn_pred_name, tst_pred_name)
        model_dir = os.path.join(self.data_dir, model_dir_name)
        os.makedirs(model_dir, exist_ok=True)

        # Only the training file has a shuffled variant.
        if self.shuffle:
            train_file_name = 'train_shuffle_lvl4_data.csv.gz'
        else:
            train_file_name = 'train_lvl4_data.csv.gz'
        df_train = pd.read_csv(os.path.join(self.data_dir, train_file_name),
                               compression='gzip',
                               low_memory=False)
        df_test = pd.read_csv(os.path.join(self.data_dir,
                                           'test_lvl4_data.csv.gz'),
                              compression='gzip',
                              low_memory=False)
        df_targets = pd.read_csv(
            os.path.join(self.data_dir, 'target_labels.csv'))

        metadata_cols = [
            'Metadata_broad_sample', 'pert_id', 'pert_idose', 'replicate_id',
            'pert_iname', 'moa', 'sig_id', 'det_plate', 'dose', 'det_well'
        ]
        # The first column of the target file is skipped; the rest are labels.
        target_cols = df_targets.columns[1:]
        df_train_x, df_train_y, df_test_x, df_test_y = split_data(
            df_train, df_test, metadata_cols, target_cols)
        df_train_x, df_test_x = umap_factor_features(df_train_x, df_test_x,
                                                     no_of_compts, no_of_dims)
        features = df_train_x.columns.tolist()
        num_features = len(features)
        num_targets = len(target_cols)
        df_train = drug_stratification(df_train,
                                       NFOLDS,
                                       target_cols,
                                       col_name='replicate_id',
                                       cpd_freq_num=24)
        # NOTE(review): pos_weight is computed but never passed to a loss in
        # this method; the call is kept in case the helper has side effects.
        pos_weight = initialize_weights(df_train, target_cols, DEVICE)

        def model_train_pred(fold, seed):
            """Train (or reload) one (seed, fold) model.

            Returns ``(oof, test_predictions)`` where ``oof`` has the shape of
            ``df_train_y`` and is filled only at this fold's validation rows.
            """
            seed_everything(seed)
            model_path = os.path.join(
                model_dir, model_file_name + f"_SEED{seed}_FOLD{fold}.pth")
            trn_idx = df_train[df_train['fold'] != fold].index
            val_idx = df_train[df_train['fold'] == fold].index

            x_fold_train = df_train_x.loc[trn_idx].reset_index(
                drop=True).copy()
            y_fold_train = df_train_y.loc[trn_idx].reset_index(
                drop=True).copy()

            x_fold_val = df_train_x.loc[val_idx].reset_index(drop=True).copy()
            y_fold_val = df_train_y.loc[val_idx].reset_index(drop=True).copy()
            df_test_x_copy = df_test_x.copy()
            x_fold_train, x_fold_val, df_test_x_copy = normalize(
                x_fold_train, x_fold_val, df_test_x_copy)

            train_dataset = TrainDataset(x_fold_train.values,
                                         y_fold_train.values)
            valid_dataset = TrainDataset(x_fold_val.values, y_fold_val.values)
            trainloader = torch.utils.data.DataLoader(
                train_dataset, batch_size=self.BATCH_SIZE, shuffle=True)
            validloader = torch.utils.data.DataLoader(
                valid_dataset, batch_size=self.BATCH_SIZE, shuffle=False)

            model = SimpleNN_Model(num_features=num_features,
                                   num_targets=num_targets,
                                   hidden_size=hidden_size)
            model.to(DEVICE)

            optimizer = torch.optim.Adam(model.parameters(),
                                         weight_decay=WEIGHT_DECAY,
                                         lr=self.LEARNING_RATE)
            scheduler = optim.lr_scheduler.OneCycleLR(
                optimizer=optimizer,
                pct_start=0.2,
                div_factor=1e3,
                max_lr=1e-2,
                epochs=self.EPOCHS,
                steps_per_epoch=len(trainloader))
            loss_train = SmoothBCEwLogits(smoothing=0.001)
            loss_val = nn.BCEWithLogitsLoss()
            early_stopping_steps = EARLY_STOPPING_STEPS
            early_step = 0

            oof = np.zeros(df_train_y.shape)
            best_loss = np.inf
            best_loss_epoch = -1

            if IS_TRAIN:
                for epoch in range(self.EPOCHS):
                    train_loss = train_fn(model, optimizer, scheduler,
                                          loss_train, trainloader, DEVICE)
                    valid_loss, valid_preds = valid_fn(model, loss_val,
                                                       validloader, DEVICE)
                    if valid_loss < best_loss:
                        # New best epoch: keep its predictions and weights.
                        best_loss = valid_loss
                        best_loss_epoch = epoch
                        oof[val_idx] = valid_preds
                        torch.save(model.state_dict(), model_path)
                    elif EARLY_STOP:
                        # NOTE(review): early_step is never reset on
                        # improvement, so it counts all non-improving epochs.
                        early_step += 1
                        if early_step >= early_stopping_steps:
                            break
                    if epoch % 10 == 0 or epoch == self.EPOCHS - 1:
                        print(f"seed: {seed}, FOLD: {fold}, EPOCH: {epoch},\
                        train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}, best_loss: {best_loss:.6f},\
                        best_loss_epoch: {best_loss_epoch}")

            #--------------------- PREDICTION---------------------
            testdataset = TestDataset(df_test_x_copy.values)
            testloader = torch.utils.data.DataLoader(
                testdataset, batch_size=self.BATCH_SIZE, shuffle=False)
            # Rebuild the network and restore the best checkpoint;
            # map_location keeps the load valid on whichever device is active.
            model = SimpleNN_Model(num_features=num_features,
                                   num_targets=num_targets,
                                   hidden_size=hidden_size)
            model.load_state_dict(torch.load(model_path, map_location=DEVICE))
            model.to(DEVICE)

            if not IS_TRAIN:
                # Without training, the OOF block comes from the reloaded
                # checkpoint. The validation criterion is `loss_val`; the
                # previously referenced `loss_fn` was never defined.
                valid_loss, valid_preds = valid_fn(model, loss_val,
                                                   validloader, DEVICE)
                oof[val_idx] = valid_preds
            predictions = inference_fn(model, testloader, DEVICE)
            return oof, predictions

        def run_k_fold(folds, seed):
            """Run every fold for one seed; sum OOF blocks, average test preds."""
            oof = np.zeros(df_train_y.shape)
            predictions = np.zeros(df_test_y.shape)
            for fold in range(folds):
                oof_, pred_ = model_train_pred(fold, seed)
                predictions += pred_ / folds
                oof += oof_
            return oof, predictions

        # Average OOF and test predictions across all seeds.
        oofs = np.zeros(df_train_y.shape)
        predictions = np.zeros(df_test_y.shape)
        time_start = time.time()
        for seed in SEED:
            oofs_, predictions_ = run_k_fold(NFOLDS, seed)
            oofs += oofs_ / len(SEED)
            predictions += predictions_ / len(SEED)
            print(f"elapsed time: {time.time() - time_start}")
        df_oofs = pd.DataFrame(oofs, columns=df_train_y.columns)
        df_preds = pd.DataFrame(predictions, columns=df_test_y.columns)

        model_eval_results(df_train_y, oofs, df_test, df_test_y, df_preds,
                           target_cols)
        save_to_csv(df_preds, self.model_pred_dir, f"{tst_pred_name}.csv")
        save_to_csv(df_oofs,
                    self.model_pred_dir,
                    f"{trn_pred_name}.csv.gz",
                    compress="gzip")
        print(
            "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
        )
    def L1000_tabnet_moa_train_pred(self):
        """Fit a ``TabNetRegressor`` per fold and save OOF and test predictions.

        Loads the level-4 train/test CSVs and ``target_labels.csv`` from
        ``self.data_dir`` (the training file depends on ``self.shuffle``),
        adds features via ``add_stat_feats``, then fits one TabNet model per
        fold. Model outputs are passed through the logistic function before
        being stored. Fold models are persisted with ``model.save_model``;
        averaged test predictions and out-of-fold predictions are evaluated
        with ``model_eval_results`` and written to ``self.model_pred_dir``.

        Instance attributes read: ``shuffle``, ``data_dir``, ``LEARNING_RATE``,
        ``EPOCHS``, ``BATCH_SIZE`` and ``model_pred_dir``.

        Returns:
            None. Results are persisted to disk and progress is printed.
        """
        print("Is GPU Available?")
        gpu_ready = torch.cuda.is_available()
        if gpu_ready:
            print("Yes, GPU is Available!!")
        else:
            print("No, GPU is NOT Available!!", "\n")
        DEVICE = 'cuda' if gpu_ready else 'cpu'

        no_of_components = 25
        NFOLDS = 5

        # Artifact names; check_if_shuffle_data receives self.shuffle and
        # returns the (possibly adjusted) names.
        model_file_name, model_dir_name, trn_pred_name, tst_pred_name = check_if_shuffle_data(
            self.shuffle,
            "L1000_tabnet",
            "L1000_tabnet_model",
            'L1000_train_preds_tabnet',
            'L1000_test_preds_tabnet',
        )
        model_dir = os.path.join(self.data_dir, model_dir_name)
        os.makedirs(model_dir, exist_ok=True)

        # Only the training file has a shuffled variant.
        train_csv_name = ('train_shuffle_lvl4_data.csv.gz'
                          if self.shuffle else 'train_lvl4_data.csv.gz')
        df_train = pd.read_csv(os.path.join(self.data_dir, train_csv_name),
                               compression='gzip',
                               low_memory=False)
        df_test = pd.read_csv(os.path.join(self.data_dir,
                                           'test_lvl4_data.csv.gz'),
                              compression='gzip',
                              low_memory=False)
        df_targets = pd.read_csv(
            os.path.join(self.data_dir, 'target_labels.csv'))

        metadata_cols = [
            'Metadata_broad_sample', 'pert_id', 'pert_idose', 'replicate_id',
            'pert_iname', 'moa', 'sig_id', 'det_plate', 'dose', 'det_well'
        ]

        # The first column of the target file is skipped; the rest are labels.
        target_cols = df_targets.columns[1:]
        df_train_x, df_train_y, df_test_x, df_test_y = split_data(
            df_train, df_test, metadata_cols, target_cols)
        df_train_x = add_stat_feats(df_train_x)
        df_test_x = add_stat_feats(df_test_x)

        df_train = drug_stratification(df_train,
                                       NFOLDS,
                                       target_cols,
                                       col_name='replicate_id',
                                       cpd_freq_num=24)
        pos_weight = initialize_weights(df_train, target_cols, DEVICE)
        # NOTE(review): wgt_bce is never used below. If `dp` is copy.deepcopy,
        # functions are returned as-is, so this assignment would alter the
        # defaults of F.binary_cross_entropy_with_logits itself -- confirm
        # before removing these two statements.
        wgt_bce = dp(F.binary_cross_entropy_with_logits)
        wgt_bce.__defaults__ = (None, None, None, 'mean', pos_weight)

        def _sigmoid(logits):
            """Map raw model outputs to the (0, 1) range."""
            return 1 / (1 + np.exp(-logits))

        def _fit_predict_fold(fold):
            """Fit TabNet on one fold; return ``(oof, test_predictions)``."""
            checkpoint_path = os.path.join(
                model_dir, model_file_name + f"_FOLD{fold}.pth")
            tabnet_params = {
                "n_d": 64,
                "n_a": 128,
                "n_steps": 1,
                "gamma": 1.3,
                "lambda_sparse": 0,
                "n_independent": 2,
                "n_shared": 1,
                "optimizer_fn": optim.Adam,
                "optimizer_params": {
                    "lr": self.LEARNING_RATE,
                    "weight_decay": 1e-5,
                },
                "mask_type": "entmax",
                "scheduler_params": {
                    "mode": "min",
                    "patience": 10,
                    "min_lr": 1e-5,
                    "factor": 0.9,
                },
                "scheduler_fn": ReduceLROnPlateau,
                "verbose": 10,
            }

            fold_split = preprocess(fold, df_train, df_train_x, df_train_y,
                                    df_test_x, no_of_components)
            (x_fold_train, y_fold_train, x_fold_val, y_fold_val,
             df_test_x_copy, val_idx) = fold_split
            x_fold_train, x_fold_val, df_test_x_copy = variance_threshold(
                x_fold_train, x_fold_val, df_test_x_copy)

            ### Fit ###
            model = TabNetRegressor(**tabnet_params)
            fold_loss = SmoothBCEwLogits(smoothing=0.001,
                                         pos_weight=pos_weight)
            model.fit(
                X_train=x_fold_train.values,
                y_train=y_fold_train.values,
                eval_set=[(x_fold_val.values, y_fold_val.values)],
                eval_name=["val"],
                eval_metric=["logits_ll"],
                max_epochs=self.EPOCHS,
                patience=40,
                batch_size=self.BATCH_SIZE,
                virtual_batch_size=32,
                num_workers=1,
                drop_last=False,
                loss_fn=fold_loss,
            )

            ###---- Prediction ---
            # Only this fold's validation rows are filled; the rest stay zero.
            oof = np.zeros(df_train_y.shape)
            oof[val_idx] = _sigmoid(model.predict(x_fold_val.values))
            predictions = _sigmoid(model.predict(df_test_x_copy.values))
            model.save_model(checkpoint_path)
            return oof, predictions

        def _cross_validate(n_folds, train_targets=df_train_y,
                            test_targets=df_test_y):
            """Run every fold; sum the OOF blocks and average test predictions."""
            oof_total = np.zeros(train_targets.shape)
            test_mean = np.zeros(test_targets.shape)
            for fold in range(n_folds):
                fold_oof, fold_pred = _fit_predict_fold(fold)
                test_mean += fold_pred / n_folds
                oof_total += fold_oof
            return oof_total, test_mean

        oofs_, predictions_ = _cross_validate(NFOLDS)
        df_oofs = pd.DataFrame(oofs_, columns=df_train_y.columns)
        df_preds = pd.DataFrame(predictions_, columns=df_test_y.columns)

        model_eval_results(df_train_y, oofs_, df_test, df_test_y, df_preds,
                           target_cols)
        save_to_csv(df_preds, self.model_pred_dir, f"{tst_pred_name}.csv")
        save_to_csv(df_oofs,
                    self.model_pred_dir,
                    f"{trn_pred_name}.csv.gz",
                    compress="gzip")
        print(
            "\n All is set, Train and Test predictions have been read as csv files into the model predictions directory!!"
        )