def run_main(args):
    """Train and evaluate a drug-response predictor on top of an AE/VAE encoder.

    Pipeline: read the expression matrix and drug-response labels from the
    paths in ``args``, filter samples with a missing response for the selected
    drug, (optionally) restrict to highly variable genes, min-max scale,
    split train/valid/test, optionally pre-train an AE or VAE feature
    extractor, train a predictor head, and print evaluation metrics for the
    held-out test split.  All console output is redirected to a timestamped
    log file for the duration of the run.

    Args:
        args: argparse.Namespace carrying the hyper-parameters and paths
            referenced below (``epochs``, ``bottleneck``, ``drug``,
            ``missing_value``, ``data_path``, ``label_path``, ``test_size``,
            ``valid_size``, ``var_genes_disp``, ``model_store_path``,
            ``pretrain_path``, ``logging_file``, ``batch_size``,
            ``ft_h_dims``, ``p_h_dims``, ``dimreduce``, ``predition``,
            ``pretrain``, ``freeze_pretrain``).
    """
    # ---------------- Parameters and paths ----------------
    epochs = args.epochs
    dim_au_out = args.bottleneck  # bottleneck width: 8, 16, 32, 64, 128, 256, 512
    select_drug = args.drug
    na = args.missing_value       # sentinel marking a missing drug response
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.model_store_path
    pretrain_path = args.pretrain_path
    log_path = args.logging_file
    batch_size = args.batch_size
    reduce_model = args.dimreduce      # "AE" or "VAE"
    prediction = args.predition        # NOTE: attribute name is misspelled upstream
    encoder_hdims = list(map(int, args.ft_h_dims.split(",")))
    preditor_hdims = list(map(int, args.p_h_dims.split(",")))

    # ---------------- Read data ----------------
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    # Redirect stdout into a timestamped log file; restored in the
    # finally block below (fixed: the original never restored stdout nor
    # closed the file, so a training exception swallowed all later output).
    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    log = open(log_path + now + ".txt", "w")
    stdout_backup = sys.stdout
    sys.stdout = log
    try:
        print(args)

        # Keep only samples with an observed response for the selected drug.
        selected_idx = label_r.loc[:, select_drug] != na

        if g_disperson is not None:  # fixed: identity comparison with None
            hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
            # Rename columns in case duplicates exist
            data_r.columns = adata.var_names
            # Restrict features to the highly variable genes
            data = data_r.loc[selected_idx, hvg]
        else:
            data = data_r.loc[selected_idx, :]

        # Extract labels
        label = label_r.loc[selected_idx, select_drug]

        # ---------------- Scaling ----------------
        mmscaler = preprocessing.MinMaxScaler()
        lbscaler = preprocessing.MinMaxScaler()
        data = mmscaler.fit_transform(data)
        label = label.values.reshape(-1, 1)

        if prediction == "regression":
            label = lbscaler.fit_transform(label)
            dim_model_out = 1
        else:
            le = LabelEncoder()
            label = le.fit_transform(label)
            dim_model_out = 2

        print(np.std(data))
        print(np.mean(data))

        # ---------------- Train / valid / test split ----------------
        X_train_all, X_test, Y_train_all, Y_test = train_test_split(
            data, label, test_size=test_size, random_state=42)
        X_train, X_valid, Y_train, Y_valid = train_test_split(
            X_train_all, Y_train_all, test_size=valid_size, random_state=42)

        # Up-sample the minority class — classification only (fixed: SMOTE
        # requires discrete class labels and raises on the continuous,
        # scaled targets of the regression branch).  fit_resample is the
        # current imbalanced-learn API; fit_sample was removed in 0.8.
        if prediction != "regression":
            X_train, Y_train = SMOTE().fit_resample(X_train, Y_train)

        print(data.shape)
        print(label.shape)
        print(X_train.shape, Y_train.shape)
        print(X_test.shape, Y_test.shape)
        print(X_train.max())
        print(X_train.min())

        # ---------------- Training device ----------------
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(device)
        if torch.cuda.is_available():
            # fixed: set_device raises on CPU-only hosts, defeating the
            # CPU fallback chosen above.
            torch.cuda.set_device(device)

        # ---------------- Datasets and data loaders ----------------
        X_trainTensor = torch.FloatTensor(X_train).to(device)
        X_validTensor = torch.FloatTensor(X_valid).to(device)
        X_testTensor = torch.FloatTensor(X_test).to(device)

        if prediction == "regression":
            Y_trainTensor = torch.FloatTensor(Y_train).to(device)
            Y_validTensor = torch.FloatTensor(Y_valid).to(device)
        else:
            # CrossEntropyLoss expects integer class indices
            Y_trainTensor = torch.LongTensor(Y_train).to(device)
            Y_validTensor = torch.LongTensor(Y_valid).to(device)

        # Autoencoder loaders reconstruct the input, hence (X, X) pairs.
        train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
        valid_dataset = TensorDataset(X_validTensor, X_validTensor)
        X_trainDataLoader = DataLoader(dataset=train_dataset,
                                       batch_size=batch_size, shuffle=True)
        X_validDataLoader = DataLoader(dataset=valid_dataset,
                                       batch_size=batch_size, shuffle=True)

        # Predictor loaders carry (X, y) pairs.
        trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
        validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)
        trainDataLoader_p = DataLoader(dataset=trainreducedDataset,
                                       batch_size=batch_size, shuffle=True)
        validDataLoader_p = DataLoader(dataset=validreducedDataset,
                                       batch_size=batch_size, shuffle=True)
        dataloaders_train = {'train': trainDataLoader_p,
                             'val': validDataLoader_p}

        # ---------------- Optional encoder pre-training ----------------
        if bool(args.pretrain):
            dataloaders_pretrain = {'train': X_trainDataLoader,
                                    'val': X_validDataLoader}

            if reduce_model == "AE":
                encoder = AEBase(input_dim=data.shape[1],
                                 latent_dim=dim_au_out,
                                 h_dims=encoder_hdims)
            elif reduce_model == "VAE":
                encoder = VAEBase(input_dim=data.shape[1],
                                  latent_dim=dim_au_out,
                                  h_dims=encoder_hdims)

            # .to(device) covers the CUDA case; the extra .cuda() call in
            # the original was redundant.
            encoder.to(device)
            print(encoder)

            optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
            loss_function_e = nn.MSELoss()
            exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

            if reduce_model == "AE":
                encoder, loss_report_en = ut.train_extractor_model(
                    net=encoder, data_loaders=dataloaders_pretrain,
                    optimizer=optimizer_e, loss_function=loss_function_e,
                    n_epochs=epochs, scheduler=exp_lr_scheduler_e,
                    save_path=pretrain_path)
            elif reduce_model == "VAE":
                encoder, loss_report_en = ut.train_VAE_model(
                    net=encoder, data_loaders=dataloaders_pretrain,
                    optimizer=optimizer_e, n_epochs=epochs,
                    scheduler=exp_lr_scheduler_e, save_path=pretrain_path)
            print("Pretrained finished")

        # ---------------- Predictor on top of the encoder ----------------
        if reduce_model == "AE":
            model = PretrainedPredictor(
                input_dim=X_train.shape[1], latent_dim=dim_au_out,
                h_dims=encoder_hdims,
                hidden_dims_predictor=preditor_hdims,
                output_dim=dim_model_out,
                pretrained_weights=pretrain_path,
                freezed=bool(args.freeze_pretrain))
        elif reduce_model == "VAE":
            model = PretrainedVAEPredictor(
                input_dim=X_train.shape[1], latent_dim=dim_au_out,
                h_dims=encoder_hdims,
                hidden_dims_predictor=preditor_hdims,
                output_dim=dim_model_out,
                pretrained_weights=pretrain_path,
                freezed=bool(args.freeze_pretrain))
        print(model)
        model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=1e-2)
        if prediction == "regression":
            loss_function = nn.MSELoss()
        else:
            loss_function = nn.CrossEntropyLoss()
        exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

        preditor_path = model_path + reduce_model + select_drug + '.pkl'
        # Reuse a previously stored predictor when one exists on disk.
        load_model = os.path.exists(preditor_path)

        model, report = ut.train_predictor_model(
            model, dataloaders_train, optimizer, loss_function, epochs,
            exp_lr_scheduler, load=load_model, save_path=preditor_path)

        dl_result = model(X_testTensor).detach().cpu().numpy()

        # ---------------- Evaluation ----------------
        print('Performances: R/Pearson/Mse/')
        if prediction == "regression":
            # fixed: sklearn metrics take (y_true, y_pred) in that order;
            # r2_score is NOT symmetric, so the original order reported a
            # wrong R².
            print(r2_score(Y_test, dl_result))
            print(pearsonr(dl_result.flatten(), Y_test.flatten()))
            print(mean_squared_error(Y_test, dl_result))
        else:
            lb_results = np.argmax(dl_result, axis=1)
            # fixed: use the positive-class score, not np.max over both
            # columns — the max is always the winning score, which makes
            # AUC/AP meaningless.  Ranking metrics are invariant under
            # monotonic transforms, so the raw column works whether the
            # model emits logits or probabilities (presumably logits, since
            # training uses CrossEntropyLoss — confirm against the model).
            pb_results = dl_result[:, 1]
            print(classification_report(Y_test, lb_results))
            print(average_precision_score(Y_test, pb_results))
            print(roc_auc_score(Y_test, pb_results))
    finally:
        # Always restore stdout and release the log file handle.
        sys.stdout = stdout_backup
        log.close()