def load_checkpoint(path: str,
                    device: torch.device = None,
                    logger: logging.Logger = None) -> MoleculeModel:
    """
    Rebuilds a model from a checkpoint file and restores every stored weight that still fits.

    Stored parameters that are absent from the freshly built model, or whose shape differs
    from the model's parameter, are reported and skipped; the model keeps its own values for them.

    :param path: Path where checkpoint is saved.
    :param device: Device where the model will be moved. When given, it replaces the device
                   held by the arguments restored from the checkpoint.
    :param logger: A logger for recording output; :func:`print` is used when it is omitted.
    :return: The loaded :class:`~chemprop.models.model.MoleculeModel`.
    """
    if logger is None:
        debug = info = print
    else:
        debug, info = logger.debug, logger.info

    # Read the checkpoint; the identity map_location leaves storages un-remapped while loading
    checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
    args = TrainArgs()
    args.from_dict(vars(checkpoint['args']), skip_unsettable=True)
    saved_weights = checkpoint['state_dict']

    if device is not None:
        args.device = device

    # Fresh model built from the restored arguments
    model = MoleculeModel(args)
    current_weights = model.state_dict()

    # Collect only the stored tensors that have a same-shaped counterpart in the model
    usable_weights = {}
    for saved_name, saved_tensor in saved_weights.items():
        # Backward compatibility: "encoder.encoder.W..." / "encoder.encoder.c..." names
        # are rewritten to live under "encoder.encoder.0"
        is_legacy_name = re.match(r'(encoder\.encoder\.)([Wc])', saved_name)
        target_name = (saved_name.replace('encoder.encoder', 'encoder.encoder.0')
                       if is_legacy_name else saved_name)

        if target_name not in current_weights:
            info(f'Warning: Pretrained parameter "{saved_name}" cannot be found in model parameters.')
            continue

        if current_weights[target_name].shape != saved_tensor.shape:
            info(f'Warning: Pretrained parameter "{saved_name}" '
                 f'of shape {saved_tensor.shape} does not match corresponding '
                 f'model parameter of shape {current_weights[target_name].shape}.')
            continue

        debug(f'Loading pretrained parameter "{saved_name}".')
        usable_weights[target_name] = saved_tensor

    # Overlay the accepted tensors on the model's own state and load the merged result
    current_weights.update(usable_weights)
    model.load_state_dict(current_weights)

    if args.cuda:
        debug('Moving model to cuda')
    model = model.to(args.device)

    return model
def load_checkpoint(path: str,
                    device: torch.device = None,
                    logger: logging.Logger = None) -> MoleculeModel:
    """
    Restores a model from a checkpoint, keeping only the stored weights that fit the model.

    :param path: Path where checkpoint is saved.
    :param device: Device where the model will be moved. When given, it replaces the device
                   held by the arguments restored from the checkpoint.
    :param logger: A logger; :func:`print` is used for all messages when it is omitted.
    :return: The loaded MoleculeModel.
    """
    debug = print if logger is None else logger.debug
    info = print if logger is None else logger.info

    # Read checkpoint contents and rebuild the training arguments from them
    checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
    args = TrainArgs()
    args.from_dict(vars(checkpoint['args']), skip_unsettable=True)
    stored = checkpoint['state_dict']

    if device is not None:
        args.device = device

    # Construct the network the weights will be copied into
    model = build_model(args)
    target = model.state_dict()

    # Keep only stored tensors whose name and shape both match the model
    accepted = {}
    for name, tensor in stored.items():
        if name not in target:
            info(f'Warning: Pretrained parameter "{name}" cannot be found in model parameters.')
            continue

        expected_shape = target[name].shape
        if expected_shape != tensor.shape:
            info(f'Warning: Pretrained parameter "{name}" '
                 f'of shape {tensor.shape} does not match corresponding '
                 f'model parameter of shape {expected_shape}.')
            continue

        debug(f'Loading pretrained parameter "{name}".')
        accepted[name] = tensor

    # Merge accepted tensors into the model's own state and load it back
    target.update(accepted)
    model.load_state_dict(target)

    if args.cuda:
        debug('Moving model to cuda')
    model = model.to(args.device)

    return model
def cross_validate(args: TrainArgs, logger: Logger = None) -> Tuple[float, float]:
    """
    Runs k-fold cross validation, training once per fold with a different seed.

    For fold ``i`` the seed is ``args.seed + i`` and outputs go to ``<save_dir>/fold_<i>``;
    ``args.seed`` and ``args.save_dir`` are overwritten in place for each fold.

    :param args: Training arguments.
    :param logger: Logger whose ``info`` receives the report; :func:`print` is used when omitted.
    :return: Mean and standard deviation (NaN-ignoring) of the per-fold average test score.
    """
    if logger is None:
        info = print
    else:
        info = logger.info

    # Remember the starting configuration, since args is mutated for every fold
    base_seed = args.seed
    root_dir = args.save_dir
    task_names = args.target_columns or get_task_names(args.data_path)

    # One training run per fold
    fold_scores = []
    for fold in range(args.num_folds):
        info(f'Fold {fold}')
        args.seed = base_seed + fold
        args.save_dir = os.path.join(root_dir, f'fold_{fold}')
        makedirs(args.save_dir)

        scores_for_fold, estimator = run_training(args, logger)

        # Save one model for each fold
        if estimator:
            process_estimator(estimator, logger, args, fold)

        fold_scores.append(scores_for_fold)

    all_scores = np.array(fold_scores)

    info(f'{args.num_folds}-fold cross validation')

    # Per-fold report
    for fold, per_task in enumerate(all_scores):
        info(f'Seed {base_seed + fold} ==> test {args.metric} = {np.nanmean(per_task):.6f}')

        if not args.show_individual_scores:
            continue

        for task_name, score in zip(task_names, per_task):
            info(f'Seed {base_seed + fold} ==> test {task_name} {args.metric} = {score:.6f}')

    # Aggregate report: first average over tasks within a fold, then across folds
    per_fold_mean = np.nanmean(all_scores, axis=1)
    mean_score = np.nanmean(per_fold_mean)
    std_score = np.nanstd(per_fold_mean)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    if args.show_individual_scores:
        for task_num, task_name in enumerate(task_names):
            info(f'Overall test {task_name} {args.metric} = '
                 f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}')

    return mean_score, std_score
def load_args(path: str) -> TrainArgs:
    """
    Recovers the training arguments stored inside a model checkpoint.

    :param path: Path where model checkpoint is saved.
    :return: A :class:`TrainArgs` populated from the checkpoint's ``'args'`` entry.
    """
    checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
    stored_args = vars(checkpoint['args'])

    args = TrainArgs()
    args.from_dict(stored_args, skip_unsettable=True)
    return args
def save_checkpoint(path: str,
                    model: MoleculeModel,
                    scaler: StandardScaler = None,
                    features_scaler: StandardScaler = None,
                    args: TrainArgs = None):
    """
    Writes a model checkpoint to disk with :func:`torch.save`.

    The stored dictionary has the keys ``'args'``, ``'state_dict'``, ``'data_scaler'`` and
    ``'features_scaler'``; each scaler entry is ``None`` or a ``{'means', 'stds'}`` dictionary.

    :param model: A MoleculeModel.
    :param scaler: A StandardScaler fitted on the data.
    :param features_scaler: A StandardScaler fitted on the features.
    :param args: Arguments.
    :param path: Path where checkpoint will be saved.
    """
    # Convert args to namespace for backwards compatibility
    stored_args = args
    if stored_args is not None:
        stored_args = Namespace(**stored_args.as_dict())

    weights = model.state_dict()

    # Only the fitted statistics of each scaler are persisted, not the scaler object itself
    data_stats = None
    if scaler is not None:
        data_stats = {'means': scaler.means, 'stds': scaler.stds}

    feature_stats = None
    if features_scaler is not None:
        feature_stats = {'means': features_scaler.means, 'stds': features_scaler.stds}

    torch.save(
        {
            'args': stored_args,
            'state_dict': weights,
            'data_scaler': data_stats,
            'features_scaler': feature_stats,
        },
        path,
    )
def chemprop_train() -> None:
    """Parses the training arguments, creates the ``'train'`` logger and runs cross validation."""
    args = TrainArgs().parse_args()
    cross_validate(
        args,
        create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet),
    )
def build_model(args: TrainArgs) -> MoleculeModel:
    """
    Constructs a MoleculeModel (encoder plus feed-forward layers) and initializes its weights.

    As a side effect ``args.output_size`` is set: it equals ``args.num_tasks``, multiplied by
    ``args.multiclass_num_classes`` for multiclass datasets.

    :param args: Arguments.
    :return: A MoleculeModel with its encoder and feed-forward layers created and initialized.
    """
    dataset_type = args.dataset_type
    is_multiclass = dataset_type == 'multiclass'

    args.output_size = args.num_tasks
    if is_multiclass:
        args.output_size *= args.multiclass_num_classes

    model = MoleculeModel(classification=dataset_type == 'classification',
                          multiclass=is_multiclass)
    model.create_encoder(args)
    model.create_ffn(args)

    # Args loaded from previous versions of chemprop may lack 'pytorch_seed' entirely
    seed = getattr(args, 'pytorch_seed', None)
    if seed is not None:
        torch.manual_seed(seed)

    initialize_weights(model)

    return model
def save_checkpoint(path: str,
                    model: MoleculeModel,
                    scaler: StandardScaler = None,
                    features_scaler: StandardScaler = None,
                    args: TrainArgs = None) -> None:
    """
    Serializes a model, its training arguments and its scaler statistics to a checkpoint file.

    :param model: A :class:`~chemprop.models.model.MoleculeModel`.
    :param scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the data.
    :param features_scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the features.
    :param args: The :class:`~chemprop.args.TrainArgs` object containing the arguments the model was trained with.
    :param path: Path where checkpoint will be saved.
    """
    def _stats(fitted):
        # A missing scaler is stored as None; otherwise only its means and stds are kept
        if fitted is None:
            return None
        return {'means': fitted.means, 'stds': fitted.stds}

    # Convert args to namespace for backwards compatibility
    namespace = Namespace(**args.as_dict()) if args is not None else args

    state = {
        'args': namespace,
        'state_dict': model.state_dict(),
        'data_scaler': _stats(scaler),
        'features_scaler': _stats(features_scaler),
    }
    torch.save(state, path)
def cross_validate_mechine(args: TrainArgs, logger: Logger = None):
    """
    Trains one model per fold on pre-split benchmark files and scores classical models on its features.

    For every fold the train/val/test CSV paths are derived from ``args.protein`` and the fold
    number, a model is trained with ``run_training``, features are extracted from it with
    ``get_xgboost_feature``, Morgan features are computed from the SMILES, and one of the
    ``svm_knn_rf_*`` helpers produces a score table that is written to
    ``<args.protein>mechine_scores.csv``.

    NOTE(review): the nesting below was reconstructed from a whitespace-flattened source; in
    particular, whether the feature extraction and the final ``to_csv`` sit inside the fold loop
    should be confirmed against the original file.

    :param args: Training arguments; ``data_path``, ``separate_test_path``, ``separate_val_path``,
                 ``seed`` and ``save_dir`` are overwritten in place for each fold.
    :param logger: Logger whose ``info`` receives fold messages; :func:`print` is used when omitted.
    :return: Nothing; results are written to CSV files.
    """
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    for fold_num in range(args.num_folds):
        # Fold directories are 1-based ("seed1", "seed2", ...) while fold_num is 0-based.
        # NOTE(review): separate_test_path points at val.csv and separate_val_path at test.csv;
        # this looks swapped -- confirm whether it is intentional.
        if args.dataset_type == 'classification':
            args.data_path = 'molnet_benchmark/molnet_random_'+args.protein+'_c/seed'+str(fold_num+1)+'/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_'+args.protein+'_c/seed'+str(fold_num+1)+'/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_'+args.protein+'_c/seed'+str(fold_num+1)+'/test.csv'
        elif args.dataset_type == 'regression':
            args.data_path = 'molnet_benchmark/molnet_random_'+args.protein+'_r/seed'+str(fold_num+1)+'/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_'+args.protein+'_r/seed'+str(fold_num+1)+'/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_'+args.protein+'_r/seed'+str(fold_num+1)+'/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores,model,scaler,df = run_training(args, logger)
        if args.loss_save:
            # Loss-saving mode: dump the frame returned by run_training and stop after this fold.
            # NOTE(review): the destination is a hard-coded absolute path on one machine.
            df.to_csv('/home/cxw/python——work/paper_gcn/dmpnn_epoch_loss/'+args.protein+'loss.csv',index=None)
            # df.to_csv(args.protein+'loss.csv',index=None)
            break
        # dmpnn_scores is only appended to; it is not read again in this function
        dmpnn_scores.append(model_scores)
        train_target, train_feature, val_target, val_feature, test_target, test_feature,train_smiles,val_smiles,test_smiles,test_preds = get_xgboost_feature(args, logger,model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)
        # A single target column selects the single-task helper, otherwise the "_more" variant.
        # NOTE(review): `scores` stays unbound for any other dataset_type, so the to_csv below
        # would raise NameError in that case.
        if args.dataset_type == 'classification':
            if test_target.shape[1]==1:
                scores = svm_knn_rf_class(train_feature, train_target,val_feature, val_target,test_feature,test_target, train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            else:
                scores = svm_knn_rf_class_more(train_feature, train_target,val_feature, val_target,test_feature,test_target, train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            scores.columns = ['type','auc']
        elif args.dataset_type == 'regression':
            if test_target.shape[1]==1:
                scores = svm_knn_rf_regre(train_feature, train_target,val_feature, val_target,test_feature,test_target, train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            else:
                scores = svm_knn_rf_regre_more(train_feature, train_target,val_feature, val_target,test_feature,test_target, train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            scores.columns = ['type', 'RMSE']
        # The file name does not include the fold number, so each write replaces the previous one
        scores.to_csv(args.protein+'mechine_scores.csv')
def chemprop_train() -> None:
    """Entry point for the :code:`chemprop_train` command line command.

    Parses the Chemprop training arguments, creates the ``'train'`` logger in the
    save directory and cross-validates a Chemprop model.
    """
    parsed = TrainArgs().parse_args()
    train_logger = create_logger(name='train',
                                 save_dir=parsed.save_dir,
                                 quiet=parsed.quiet)
    cross_validate(parsed, train_logger)
def train_outside(args_dict):
    """
    Runs cross-validated training when invoked from another python script.

    ``sys.argv`` is replaced with the list built by ``create_args`` before the arguments
    are parsed, and it is left that way afterwards.

    :dict args_dict: dict of args to use
    """
    argv = create_args(args_dict, 'train.py')
    sys.argv = argv

    parsed = TrainArgs().parse_args()
    train_logger = create_logger(name='train',
                                 save_dir=parsed.save_dir,
                                 quiet=parsed.quiet)
    cross_validate(parsed, train_logger)
def save_checkpoint(
    path: str,
    model: MoleculeModel,
    scaler: StandardScaler = None,
    features_scaler: StandardScaler = None,
    atom_descriptor_scaler: StandardScaler = None,
    bond_feature_scaler: StandardScaler = None,
    args: TrainArgs = None,
) -> None:
    """
    Writes the model weights, training arguments and scaler statistics to a checkpoint file.

    Each scaler is stored as ``None`` when absent, otherwise as a dictionary holding its
    ``means`` and ``stds``.

    :param model: A :class:`~chemprop.models.model.MoleculeModel`.
    :param scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the data.
    :param features_scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the features.
    :param atom_descriptor_scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the atom descriptors.
    :param bond_feature_scaler: A :class:`~chemprop.data.scaler.StandardScaler` fitted on the bond features.
    :param args: The :class:`~chemprop.args.TrainArgs` object containing the arguments the model was trained with.
    :param path: Path where checkpoint will be saved.
    """
    # Convert args to namespace for backwards compatibility
    stored_args = None if args is None else Namespace(**args.as_dict())

    state = {
        "args": stored_args,
        "state_dict": model.state_dict(),
    }

    # Checkpoint key -> fitted scaler, listed in the order the keys are stored
    fitted_scalers = (
        ("data_scaler", scaler),
        ("features_scaler", features_scaler),
        ("atom_descriptor_scaler", atom_descriptor_scaler),
        ("bond_feature_scaler", bond_feature_scaler),
    )
    for key, fitted in fitted_scalers:
        if fitted is None:
            state[key] = None
        else:
            state[key] = {"means": fitted.means, "stds": fitted.stds}

    torch.save(state, path)
import os

import torch

# Environment checks, printed before the chemprop imports below are attempted
print('working directory is:', os.getcwd(), sep='\n')
print('is CUDA available?', torch.cuda.is_available(), sep='\n')

# Project imports
from chemprop.args import TrainArgs
from chemprop.train.run_training import run_training

# Create the argument object and seed it with the dataset type and data location
args = TrainArgs()
args.from_dict(dict(
    dataset_type='regression',
    data_path='/home/willlamb/chempropBayes/data/qm9.csv',
))

##################### ARGS #####################

# Architecture: the feed-forward hidden size mirrors the encoder hidden size
args.hidden_size = 500
#args.depth = 5
args.ffn_hidden_size = args.hidden_size
args.ffn_num_layers = 3
args.activation = 'ReLU'

# No additional features are supplied
args.features_path = None
args.features_generator = None
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Splits the data, trains an ensemble of Chemprop models, and returns the ensemble's test scores.

    Each ensemble member is evaluated on the test set using its checkpoint with the best
    validation score for :code:`args.metric` (lowest when :code:`args.minimize_score` is set,
    highest otherwise). The ensemble scores are also written to ``test_scores.json`` in
    :code:`args.save_dir`, and averaged predictions to ``test_preds.csv`` when
    :code:`args.save_preds` is set.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model. ``train_data_size`` (and, for
                 spectra, ``spectra_phase_mask``) are set on it as a side effect.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output; :func:`print` is used when omitted.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(
            path=args.separate_test_path,
            args=args,
            features_path=args.separate_test_features_path,
            atom_descriptors_path=args.separate_test_atom_descriptors_path,
            bond_features_path=args.separate_test_bond_features_path,
            phase_features_path=args.separate_test_phase_features_path,
            smiles_columns=args.smiles_columns,
            logger=logger)
    if args.separate_val_path:
        val_data = get_data(
            path=args.separate_val_path,
            args=args,
            features_path=args.separate_val_features_path,
            atom_descriptors_path=args.separate_val_atom_descriptors_path,
            bond_features_path=args.separate_val_bond_features_path,
            phase_features_path=args.separate_val_phase_features_path,
            smiles_columns=args.smiles_columns,
            logger=logger)

    # Whatever was not supplied as a separate file is carved out of `data`;
    # with both files supplied, all of `data` is used for training.
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              num_folds=args.num_folds,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             num_folds=args.num_folds,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            num_folds=args.num_folds,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
            logger=logger,
        )

    # Every feature scaler is fitted on the training split and then applied to val and test
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler,
                                    scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler,
                                     scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler,
                                    scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler,
                                     scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = train_data.normalize_targets()
    elif args.dataset_type == 'spectra':
        debug(
            'Normalizing spectra and excluding spectra regions based on phase')
        args.spectra_phase_mask = load_phase_mask(args.spectra_phase_mask_path)
        for dataset in [train_data, test_data, val_data]:
            data_targets = normalize_spectra(
                spectra=dataset.targets(),
                phase_features=dataset.phase_features(),
                phase_mask=args.spectra_phase_mask,
                excluded_sub_value=None,
                threshold=args.spectra_target_floor,
            )
            dataset.set_targets(data_targets)
        scaler = None
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation; sum_test_preds accumulates predictions over ensemble members
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache: graph caching is enabled together with
    # zero loader workers for datasets no larger than args.cache_cutoff
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(
            f'With class_balance, effective train size = {train_data_loader.iter_size:,}'
        )

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # The directory keyword is spelled `logdir` by some SummaryWriter implementations;
            # a rejected keyword raises TypeError. Only that case triggers the retry, so
            # interrupts and genuine failures are no longer swallowed.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        # Optionally, overwrite weights:
        if args.checkpoint_frzn is not None:
            debug(
                f'Loading and freezing parameters from {args.checkpoint_frzn}.'
            )
            model = load_frzn_model(model=model,
                                    path=args.checkpoint_frzn,
                                    current_args=args,
                                    logger=logger)

        debug(model)

        if args.checkpoint_frzn is not None:
            debug(f'Number of unfrozen parameters = {param_count(model):,}')
            debug(f'Total number of parameters = {param_count_all(model):,}')
        else:
            debug(f'Number of parameters = {param_count_all(model):,}')

        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                        features_scaler, atom_descriptor_scaler,
                        bond_feature_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # Only an ExponentialLR scheduler is stepped here, once per epoch
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metrics=args.metrics,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f'Validation {metric} = {avg_val_score:.6f}')
                writer.add_scalar(f'validation_{metric}', avg_val_score,
                                  n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(
                            f'Validation {task_name} {metric} = {val_score:.6f}'
                        )
                        writer.add_scalar(f'validation_{task_name}_{metric}',
                                          val_score, n_iter)

            # Save model checkpoint if improved validation score
            avg_val_score = np.nanmean(val_scores[args.metric])
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model,
                                scaler, features_scaler,
                                atom_descriptor_scaler, bond_feature_scaler,
                                args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # An empty prediction list is not added to the running sum
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}')
            writer.add_scalar(f'test_{metric}', avg_test_score, 0)

            if args.show_individual_scores and args.dataset_type != 'spectra':
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(
                        f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}'
                    )
                    writer.add_scalar(f'test_{task_name}_{metric}',
                                      test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(
                    f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}'
                )

    # Save scores
    with open(os.path.join(args.save_dir, 'test_scores.json'), 'w') as f:
        json.dump(ensemble_scores, f, indent=4, sort_keys=True)

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(
            data={'smiles': test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [
                pred[i] for pred in avg_test_preds
            ]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir,
                                                 'test_preds.csv'),
                                    index=False)

    return ensemble_scores
import os

import torch

# Report the working directory and CUDA availability before importing chemprop
print('working directory is:', os.getcwd(), sep='\n')
print('is CUDA available?', torch.cuda.is_available(), sep='\n')

# Project imports
from chemprop.args import TrainArgs
from chemprop.train.pdts import pdts

# Build the argument object from the dataset type and the data location
args = TrainArgs()
args.from_dict(dict(
    dataset_type='regression',
    data_path='/home/willlamb/chempropBayes/data/qm9.csv',
))

##################### ARGS #####################

# Architecture: the feed-forward hidden size mirrors the encoder hidden size
args.hidden_size = 500
args.ffn_hidden_size = args.hidden_size
args.depth = 5
args.ffn_num_layers = 3
args.activation = 'ReLU'

# No additional features file is supplied
args.features_path = None
def cross_validate(args: TrainArgs,
                   train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
                   ) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds. Fold ``i`` uses seed ``args.seed + i`` and
    the directory ``<save_dir>/fold_<i>``; ``args.seed`` and ``args.save_dir`` are overwritten
    in place for every fold. Per-task results for all metrics are written as CSV to
    ``TEST_SCORES_FILE_NAME`` in the original save directory.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training. It is called as
                       ``train_func(args, data, logger)`` and must return a mapping from metric
                       name to a list of per-task scores.
    :return: A tuple containing the mean and standard deviation performance across folds
             for :code:`args.metric`.
    :raises ValueError: If target weights are given and their number differs from the number of tasks.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path, smiles_columns=args.smiles_columns,
                                     target_columns=args.target_columns, ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args; when saving fails with CalledProcessError, retry without the reproducibility section
    makedirs(args.save_dir)
    try:
        args.save(os.path.join(args.save_dir, 'args.json'))
    except subprocess.CalledProcessError:
        debug('Could not write the reproducibility section of the arguments to file, thus omitting this section.')
        args.save(os.path.join(args.save_dir, 'args.json'), with_reproducibility=False)

    # set explicit H option and reaction option
    reset_featurization_parameters(logger=logger)
    set_explicit_h(args.explicit_h)
    set_adding_hs(args.adding_h)
    if args.reaction:
        set_reaction(args.reaction, args.reaction_mode)
    elif args.reaction_solvent:
        set_reaction(True, args.reaction_mode)

    # Get data
    debug('Loading data')
    data = get_data(
        path=args.data_path,
        args=args,
        logger=logger,
        skip_none_targets=True,
        data_weights_path=args.data_weights_path
    )
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()

    # Record the sizes of any extra atom/bond inputs found in the loaded data
    if args.atom_descriptors == 'descriptor':
        args.atom_descriptors_size = data.atom_descriptors_size()
        args.ffn_hidden_size += args.atom_descriptors_size
    elif args.atom_descriptors == 'feature':
        args.atom_features_size = data.atom_features_size()
        set_extra_atom_fdim(args.atom_features_size)
    if args.bond_features_path is not None:
        args.bond_features_size = data.bond_features_size()
        set_extra_bond_fdim(args.bond_features_size)

    debug(f'Number of tasks = {args.num_tasks}')

    if args.target_weights is not None and len(args.target_weights) != args.num_tasks:
        raise ValueError('The number of provided target weights must match the number and order of the prediction tasks')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        data.reset_features_and_targets()

        # If resuming experiment, load results from trained models
        test_scores_path = os.path.join(args.save_dir, 'test_scores.json')
        if args.resume_experiment and os.path.exists(test_scores_path):
            print('Loading scores')
            with open(test_scores_path) as f:
                model_scores = json.load(f)
        # Otherwise, train the models
        else:
            model_scores = train_func(args, data, logger)

        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays; each array is indexed below as [fold, task]
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    # NOTE: NaN scores are only detected while individual scores are being shown
    contains_nan_scores = False
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {multitask_mean(scores[fold_num], metric):.6f}')

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')
                    if np.isnan(score):
                        contains_nan_scores = True

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = multitask_mean(scores, axis=1, metric=metric)  # average score for each model across tasks
        mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(f'\tOverall test {task_name} {metric} = '
                     f'{np.mean(scores[:, task_num]):.6f} +/- {np.std(scores[:, task_num]):.6f}')

    if contains_nan_scores:
        info("The metric scores observed for some fold test splits contain 'nan' values. \
            This can occur when the test set does not meet the requirements \
            for a particular metric, such as having no valid instances of one \
            task in the test set or not having positive examples for some classification metrics. \
            Before v1.5.1, the default behavior was to ignore nan values in individual folds or tasks \
            and still return an overall average for the remaining folds or tasks. The behavior now \
            is to include them in the average, converting overall average metrics to 'nan' as well.")

    # Save scores
    # newline='' hands line endings to the csv writer, as the csv module documentation requires;
    # without it an extra blank line follows every row on platforms that translate '\n'.
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w', newline='') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        if args.dataset_type == 'spectra':  # spectra data type has only one score to report
            row = ['spectra']
            for metric, scores in all_scores.items():
                task_scores = scores[:,0]
                mean, std = np.mean(task_scores), np.std(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)
        else:  # all other data types, separate scores by task
            for task_num, task_name in enumerate(args.task_names):
                row = [task_name]
                for metric, scores in all_scores.items():
                    task_scores = scores[:, task_num]
                    mean, std = np.mean(task_scores), np.std(task_scores)
                    row += [mean, std] + task_scores.tolist()
                writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = multitask_mean(all_scores[args.metric], metric=args.metric, axis=1)
    mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([pd.read_csv(os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
                               for fold_num in range(args.num_folds)])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains an ensemble of Chemprop models and returns the ensemble's test scores.

    The data is split into train/validation/test sets, the optional feature, atom descriptor and
    bond feature scalers are fitted on the training split and applied to the other splits, and
    ``args.ensemble_size`` models are trained. For every model the checkpoint with the best
    validation score on ``args.metric`` is reloaded and evaluated on the test split; the test
    predictions of all models are averaged to score the ensemble.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output. When omitted, messages are printed.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    # NOTE: handling of args.separate_val_path / args.separate_test_path is disabled in this
    # version of the function; the three splits always come from `data` via split_data.
    debug(f"Splitting data with seed {args.seed}")
    train_data, val_data, test_data = split_data(
        data=data,
        split_type=args.split_type,
        sizes=args.split_sizes,
        seed=args.seed,
        num_folds=args.num_folds,
        args=args,
        logger=logger,
    )

    if args.dataset_type == "classification":
        class_sizes = get_class_sizes(data)
        debug("Class sizes")
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f"{args.task_names[i]} "
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
        )

    # Each scaler is fitted on the training split only and then reused for val/test.
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f"Total size = {len(data):,} | "
        f"train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}"
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == "regression":
        debug("Fitting scaler")
        scaler = train_data.normalize_targets()
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == "multiclass":
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(
        dataset=train_data,
        batch_size=args.batch_size,
        num_workers=num_workers,
        class_balance=args.class_balance,
        shuffle=True,
        seed=args.seed,
    )
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(
            f"With class_balance, effective train size = {train_data_loader.iter_size:,}"
        )

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f"model_{model_idx}")
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling; an unexpected keyword
            # argument surfaces as TypeError, so nothing else is swallowed here.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f"Loading model {model_idx} from {args.checkpoint_paths[model_idx]}"
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f"Building model {model_idx}")
            model = MoleculeModel(args)

        debug(model)
        debug(f"Number of parameters = {param_count(model):,}")
        if args.cuda:
            debug("Moving model to cuda")
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(
            os.path.join(save_dir, MODEL_FILE_NAME),
            model,
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
            args,
        )

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float("inf") if args.minimize_score else -float("inf")
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f"Epoch {epoch}")

            n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer,
            )
            # Only an ExponentialLR scheduler is stepped here, once per epoch.
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                num_tasks=args.num_tasks,
                metrics=args.metrics,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger,
            )

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f"Validation {metric} = {avg_val_score:.6f}")
                writer.add_scalar(f"validation_{metric}", avg_val_score, n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(
                            f"Validation {task_name} {metric} = {val_score:.6f}"
                        )
                        writer.add_scalar(f"validation_{task_name}_{metric}",
                                          val_score, n_iter)

            # Save model checkpoint if improved validation score
            avg_val_score = np.nanmean(val_scores[args.metric])
            if (args.minimize_score and avg_val_score < best_score
                    or not args.minimize_score and avg_val_score > best_score):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(
                    os.path.join(save_dir, MODEL_FILE_NAME),
                    model,
                    scaler,
                    features_scaler,
                    atom_descriptor_scaler,
                    bond_feature_scaler,
                    args,
                )

        # Evaluate on test set using model with best validation score
        info(
            f"Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}"
        )
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metrics=args.metrics,
            dataset_type=args.dataset_type,
            logger=logger,
        )

        # Accumulate this model's predictions for the ensemble average.
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f"Model {model_idx} test {metric} = {avg_test_score:.6f}")
            writer.add_scalar(f"test_{metric}", avg_test_score, 0)

            if args.show_individual_scores:
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(
                        f"Model {model_idx} test {task_name} {metric} = {test_score:.6f}"
                    )
                    writer.add_scalar(f"test_{task_name}_{metric}",
                                      test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metrics=args.metrics,
        dataset_type=args.dataset_type,
        logger=logger,
    )

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f"Ensemble test {metric} = {avg_ensemble_test_score:.6f}")

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(
                    f"Ensemble test {task_name} {metric} = {ensemble_score:.6f}"
                )

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(
            data={"smiles": test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [
                pred[i] for pred in avg_test_preds
            ]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir, "test_preds.csv"),
                                    index=False)

    return ensemble_scores
def cross_validate(args: TrainArgs,
                   logger: Logger = None) -> Tuple[float, float]:
    """
    Runs k-fold cross validation and reports the aggregated test scores.

    For each of ``args.num_folds`` folds the seed and save directory on ``args`` are updated,
    ``run_training`` is called, and its per-task scores are collected. Per-fold and overall
    scores are logged, the overall mean and standard deviation are sent to ``wandb``, and the
    full score matrix is passed to ``save_results``.

    :param args: Training arguments. ``args.seed`` and ``args.save_dir`` are overwritten for every fold.
    :param logger: A logger to record output. When omitted, messages are printed.
    :return: A tuple containing the mean and standard deviation of the per-fold average test score.
    :raises ValueError: If ``args.seeds`` is given but its length differs from ``args.num_folds``.
    """
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    task_names = args.target_columns or get_task_names(args.data_path)

    if args.seeds and (len(args.seeds) != args.num_folds):
        raise ValueError(
            "Num seeds provided must match the number of folds required")

    # Run training on different random seeds for each fold
    all_scores = []
    fold_seeds = []  # seed actually used by each fold, so the report below is accurate
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        if args.seeds:
            args.seed = args.seeds[fold_num]
        else:
            args.seed = init_seed + fold_num
        fold_seeds.append(args.seed)
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_training(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_seed, scores in zip(fold_seeds, all_scores):
        info(
            f'Seed {fold_seed} ==> test {args.metric} = {np.nanmean(scores):.6f}'
        )

        if args.show_individual_scores:
            for task_name, score in zip(task_names, scores):
                info(
                    f'Seed {fold_seed} ==> test {task_name} {args.metric} = {score:.6f}'
                )

    # Report scores across models
    avg_scores = np.nanmean(
        all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    wandb.log({
        f'test_{args.metric}_mean': mean_score,
        f'test_{args.metric}_std': std_score
    })

    # save results
    save_results(all_scores, [], task_names, args)

    if args.show_individual_scores:
        for task_num, task_name in enumerate(task_names):
            info(
                f'Overall test {task_name} {args.metric} = '
                f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}'
            )

    return mean_score, std_score
import glob
import os

import pandas as pd

from chemprop.args import TrainArgs
from chemprop.train import cross_validate
from chemprop.utils import create_logger

# Stack every CSV found in the TMPRSS2 data folder into a single frame.
csvs = glob.glob(os.path.join('../data/tmprss2_meyer_et_al/', '*.csv'))
raw_data = pd.concat([pd.read_csv(csv_path) for csv_path in csvs])

# Keep only the two columns that are written out as the Chemprop input file.
chemprop_data = raw_data[['SMILES', 'Activity']]
chemprop_data.to_csv('chemprop_in.csv', index=False)

# argument passing pretty janky but it's set up to use command line
cli_args = [
    '--data_path', 'chemprop_in.csv',
    '--dataset_type', 'regression',
    '--save_dir', 'models',
]
args = TrainArgs().parse_args(cli_args)

logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)

# Train and evaluate with cross validation.
cross_validate(args, logger)
def cross_validate(
    args: TrainArgs,
    train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds. Per-task scores for every metric are
    written to a CSV file in the original save directory, and the per-fold test
    predictions are optionally merged into a single file.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model. ``args.seed`` and
                 ``args.save_dir`` are overwritten for every fold.
    :param train_func: Function which runs training. It is called with ``args``, a deep copy of
                       the data and the logger, and must return a mapping from metric name to
                       per-task scores.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path, args=args, logger=logger, skip_none_targets=True)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = train_func(
            args, deepcopy(data), logger)  # deepcopy since data may be modified
        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(
                f'\tSeed {init_seed + fold_num} ==> test {metric} = {np.nanmean(scores[fold_num]):.6f}'
            )

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(
                        f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}'
                    )

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = np.nanmean(
            scores, axis=1)  # average score for each model across tasks
        mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(
                    f'\tOverall test {task_name} {metric} = '
                    f'{np.nanmean(scores[:, task_num]):.6f} +/- {np.nanstd(scores[:, task_num]):.6f}'
                )

    # Save scores
    # newline='' lets the csv writer control line endings itself, as the csv module
    # documentation requires; without it extra blank rows appear on some platforms.
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w', newline='') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        for task_num, task_name in enumerate(args.task_names):
            row = [task_name]
            for metric, scores in all_scores.items():
                task_scores = scores[:, task_num]
                mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = np.nanmean(all_scores[args.metric], axis=1)
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([
            pd.read_csv(
                os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
            for fold_num in range(args.num_folds)
        ])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
def cross_validate(args: TrainArgs,
                   logger: Logger = None) -> Tuple[float, float]:
    """
    Runs k-fold cross validation and writes the per-task test scores to ``test_scores.csv``.

    Each fold uses the seed ``args.seed + fold_num`` and its own ``fold_<n>`` sub-directory of the
    original save directory; ``run_training`` is called once per fold and its per-task scores are
    aggregated.

    :param args: Training arguments. ``args.task_names``, ``args.seed`` and ``args.save_dir`` are
                 overwritten by this function.
    :param logger: A logger to record output. When omitted, messages are printed.
    :return: A tuple containing the mean and standard deviation of the per-fold average test score.
    """
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_training(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(
            f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}'
        )

        if args.show_individual_scores:
            for task_name, score in zip(args.task_names, scores):
                info(
                    f'Seed {init_seed + fold_num} ==> test {task_name} {args.metric} = {score:.6f}'
                )

    # Report scores across models
    avg_scores = np.nanmean(
        all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    if args.show_individual_scores:
        for task_num, task_name in enumerate(args.task_names):
            info(
                f'Overall test {task_name} {args.metric} = '
                f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}'
            )

    # Save scores
    # newline='' lets the csv writer control line endings itself, as the csv module
    # documentation requires; without it extra blank rows appear on some platforms.
    with open(os.path.join(save_dir, 'test_scores.csv'), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Task', f'Mean {args.metric}', f'Standard deviation {args.metric}'
        ] + [f'Fold {i} {args.metric}' for i in range(args.num_folds)])

        for task_num, task_name in enumerate(args.task_names):
            task_scores = all_scores[:, task_num]
            mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
            writer.writerow([task_name, mean, std] + task_scores.tolist())

    return mean_score, std_score
def chemprop_train() -> None:
    """Command line entry point for :code:`chemprop_train`.

    Parses the Chemprop training arguments and hands them to :func:`cross_validate`,
    which trains (cross-validates) a model using :func:`run_training`.
    """
    parsed_args = TrainArgs().parse_args()
    cross_validate(args=parsed_args, train_func=run_training)
import os
import torch

# checks: report the working directory and whether CUDA can be used
print('working directory is:', os.getcwd(), sep='\n')
print('is CUDA available?', torch.cuda.is_available(), sep='\n')

# imports
from chemprop.args import TrainArgs
from chemprop.train.new_noise import new_noise

# instantiate args class and load from dict
base_settings = {
    'dataset_type': 'regression',
    'data_path': '/Users/georgelamb/Documents/GitHub/chempropBayes/data/qm9.csv',
}
args = TrainArgs()
args.from_dict(base_settings)

##################### ARGS #####################

# architecture
args.hidden_size = 500
args.ffn_hidden_size = args.hidden_size  # the FFN layers reuse the hidden size set above
args.depth = 5
args.ffn_num_layers = 3
args.activation = 'ReLU'
import os
import torch

# checks: report the working directory and whether CUDA can be used
print('working directory is:', os.getcwd(), sep='\n')
print('is CUDA available?', torch.cuda.is_available(), sep='\n')

# imports
from chemprop.args import TrainArgs
from chemprop.train.run_training import run_training

# instantiate args class and load from dict
base_settings = {
    'dataset_type': 'regression',
    'data_path': '/home/willlamb/chempropBayes/data/qm9.csv',
}
args = TrainArgs()
args.from_dict(base_settings)

##################### ARGS #####################

# architecture
args.hidden_size = 500
args.ffn_hidden_size = args.hidden_size  # the FFN layers reuse the hidden size set above
args.depth = 5
args.ffn_num_layers = 3
args.activation = 'ReLU'
def update_prediction_args(
    predict_args: PredictArgs,
    train_args: TrainArgs,
    missing_to_defaults: bool = True,
    validate_feature_sources: bool = True,
) -> None:
    """
    Updates prediction arguments with training arguments loaded from a checkpoint file.
    If an argument is present in both, the prediction argument will be used.

    Also raises errors for situations where the prediction arguments and training arguments
    are different but must match for proper function.

    :param predict_args: The :class:`~chemprop.args.PredictArgs` object containing the arguments to use for making predictions.
                         It is modified in place.
    :param train_args: The :class:`~chemprop.args.TrainArgs` object containing the arguments used to train the model previously.
    :param missing_to_defaults: Whether to replace missing training arguments with the current defaults for
                                :class:`~chemprop.args.TrainArgs`. This is used for backwards compatibility.
    :param validate_feature_sources: Indicates whether the feature sources (from path or generator) are checked for
                                     consistency between the training and prediction arguments. This is not necessary
                                     for fingerprint generation, where molecule features are not used.
    :raises ValueError: If the number of molecules, atom descriptors, bond features, features scaling or
                        (when ``validate_feature_sources`` is set) feature sources are inconsistent between
                        training and prediction.
    """
    # Copy over every training argument the prediction arguments do not already define.
    for key, value in vars(train_args).items():
        if not hasattr(predict_args, key):
            setattr(predict_args, key, value)

    if missing_to_defaults:
        # If a default argument would cause different behavior than occurred in legacy checkpoints before the argument existed,
        # then that argument must be included in the `override_defaults` dictionary to force the legacy behavior.
        override_defaults = {
            "bond_features_scaling": False,
            "no_bond_features_scaling": True,
            "atom_descriptors_scaling": False,
            "no_atom_descriptors_scaling": True,
        }
        default_train_args = TrainArgs().parse_args([
            "--data_path", None, "--dataset_type", str(train_args.dataset_type)
        ])
        for key, value in vars(default_train_args).items():
            if not hasattr(predict_args, key):
                setattr(predict_args, key, override_defaults.get(key, value))

    # Same number of molecules must be used in training as in making predictions.
    # The only exception is MPN fingerprint generation with a shared MPN and one input molecule.
    if train_args.number_of_molecules != predict_args.number_of_molecules and not (
            isinstance(predict_args, FingerprintArgs)
            and predict_args.fingerprint_type == "MPN"
            and predict_args.mpn_shared
            and predict_args.number_of_molecules == 1):
        raise ValueError(
            "A different number of molecules was used in training "
            "model than is specified for prediction. This is only supported for models with shared MPN networks "
            f"and a fingerprint type of MPN. {train_args.number_of_molecules} smiles fields must be provided."
        )

    # If atom descriptors were used during training, they must be used when predicting and vice-versa
    if train_args.atom_descriptors != predict_args.atom_descriptors:
        raise ValueError(
            "The use of atom descriptors is inconsistent between training and prediction. "
            "If atom descriptors were used during training, they must be specified again "
            "during prediction using the same type of descriptors as before. "
            "If they were not used during training, they cannot be specified during prediction."
        )

    # If bond features were used during training, they must be used when predicting and vice-versa
    if (train_args.bond_features_path is None) != (predict_args.bond_features_path is None):
        raise ValueError(
            "The use of bond descriptors is different between training and prediction. If you used bond "
            "descriptors for training, please specify a path to new bond descriptors for prediction."
        )

    # If the additional features were scaled during training, the same must be done during prediction
    if train_args.features_scaling != predict_args.features_scaling:
        raise ValueError(
            "If scaling of the additional features was done during training, the "
            "same must be done during prediction.")

    # If features were used during training, they must be used when predicting
    if validate_feature_sources:
        features_path_differs = (train_args.features_path is None) != (predict_args.features_path is None)
        features_generator_differs = (
            (train_args.features_generator is None) != (predict_args.features_generator is None))
        if features_path_differs or features_generator_differs:
            raise ValueError(
                "Features were used during training so they must be specified again during "
                "prediction using the same type of features as before "
                "(with either --features_generator or --features_path "
                "and using --no_features_scaling if applicable).")
def new_noise(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Fits a Student's t distribution to the training residuals of saved ensemble members.

    For every ensemble member a checkpoint matching ``args.method`` is loaded, predictions are made
    on the training split, and for each task the residuals (prediction minus target) are fitted
    with ``scipy.stats.t.fit`` using a location fixed at zero. The fitted parameters are written
    to ``<args.results_dir>/model_<idx>/tstats_<sample_idx>.npz`` once per sample index.

    :param args: Arguments. ``task_names``, ``num_tasks``, ``features_size`` and ``train_data_size``
                 are set on it; some method-specific attributes are overwritten for the ``gp`` and
                 ``dun`` methods.
    :param logger: Logger, forwarded to the data loading helpers and some checkpoint loads.
    :return: Nothing is returned by the current implementation, despite the annotation.
    :raises ValueError: If ``args.method`` is not one of the handled methods.
    """
    debug = print

    # Get data
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    train_data, val_data, test_data = split_data(data=data,
                                                 split_type=args.split_type,
                                                 sizes=args.split_sizes,
                                                 seed=args.seed,
                                                 args=args,
                                                 logger=logger)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    # Grab the targets before any scaling so the residuals can always be computed,
    # whatever the dataset type.
    train_targets = train_data.targets()

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    train_targets = np.array(train_targets)

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loader
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache)

    ###########################################
    ########## Outer loop over ensemble members
    ###########################################

    for model_idx in range(args.ensemble_start_idx,
                           args.ensemble_start_idx + args.ensemble_size):

        # load the model
        method = args.method
        if method in ('map', 'swag', 'sgld', 'dropR', 'dropA'):
            model = load_checkpoint(args.checkpoint_path +
                                    f'/model_{model_idx}/model.pt',
                                    device=args.device,
                                    logger=logger)

        elif method == 'gp':
            args.num_inducing_points = 1200
            fake_model = MoleculeModel(args)
            fake_model.featurizer = True
            feature_extractor = fake_model
            inducing_points = initial_inducing_points(train_data_loader,
                                                      feature_extractor, args)
            gp_layer = GPLayer(inducing_points, args.num_tasks)
            model = load_checkpoint(
                args.checkpoint_path + f'/model_{model_idx}/DKN_model.pt',
                device=args.device,
                logger=None,
                template=DKLMoleculeModel(MoleculeModel(args, featurizer=True),
                                          gp_layer))

        elif method == 'bbp':
            template = MoleculeModelBBP(args)
            for layer in template.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
            for layer in template.encoder.encoder.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
            model = load_checkpoint(args.checkpoint_path +
                                    f'/model_{model_idx}/model_bbp.pt',
                                    device=args.device,
                                    logger=None,
                                    template=template)

        elif method == 'dun':
            args.prior_sig_dun = 0.05
            args.depth_min = 1
            args.depth_max = 5
            args.rho_min_dun = -5.5
            args.rho_max_dun = -5
            args.log_cat_init = 0
            template = MoleculeModelDUN(args)
            for layer in template.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_dun, args.rho_max_dun)
            for layer in template.encoder.encoder.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_dun, args.rho_max_dun)
            template.create_log_cat(args)
            model = load_checkpoint(args.checkpoint_path +
                                    f'/model_{model_idx}/model_dun.pt',
                                    device=args.device,
                                    logger=None,
                                    template=template)

        else:
            # Fail with a clear message instead of hitting an unbound `model` below.
            raise ValueError(f'Unsupported method "{method}".')

        # make results_dir
        results_dir = os.path.join(args.results_dir, f'model_{model_idx}')
        makedirs(results_dir)

        # train_preds
        train_preds = predict(model=model,
                              data_loader=train_data_loader,
                              args=args,
                              scaler=scaler,
                              test_data=False,
                              bbp_sample=False)
        train_preds = np.array(train_preds)

        # compute tstats: one row per task holding the three values returned by
        # stats.t.fit (location is held fixed at zero via floc)
        tstats = np.ones((args.num_tasks, 3))
        for task in range(args.num_tasks):
            resid = train_preds[:, task] - train_targets[:, task]
            tstats[task] = np.array(stats.t.fit(resid, floc=0.0))

        ##################################
        ########## Inner loop over samples
        ##################################

        for sample_idx in range(args.samples):
            # save down (the same statistics are stored under every sample index)
            np.savez(os.path.join(results_dir, f'tstats_{sample_idx}'), tstats)
            print('done one')
def train():
    """Serve the train page; on a non-GET request, train a model from the submitted form.

    The form supplies the dataset name, number of epochs, ensemble size, checkpoint name and
    optionally a GPU and dataset type. The selected dataset type is checked against the labels
    in the data before training starts. After training, every ``.pt`` file found in the temporary
    save directory is registered in the database and moved into the checkpoint folder.
    """
    global PROGRESS, TRAINING

    warnings = []
    errors = []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    form = request.form
    data_name = form['dataName']
    epochs = int(form['epochs'])
    ensemble_size = int(form['ensembleSize'])
    checkpoint_name = form['checkpointName']
    gpu = form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv')
    dataset_type = form.get('datasetType', 'regression')

    # Create and modify args
    cli_args = [
        '--data_path', data_path,
        '--dataset_type', dataset_type,
        '--epochs', str(epochs),
        '--ensemble_size', str(ensemble_size),
    ]
    args = TrainArgs().parse_args(cli_args)

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    unique_targets = set()
    for row in targets:
        for target in row:
            if target is not None:
                unique_targets.add(target)

    binary_labels = {0, 1}
    if dataset_type == 'classification' and unique_targets - binary_labels:
        errors.append(
            'Selected classification dataset but not all labels are 0 or 1. Select regression instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    if dataset_type == 'regression' and unique_targets <= binary_labels:
        errors.append(
            'Selected regression dataset but all labels are 0 or 1. Select classification instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    # The literal string 'None' in the form means "do not use CUDA".
    if gpu == 'None':
        args.cuda = False
    elif gpu is not None:
        args.gpu = int(gpu)

    # Use DEFAULT as current user if the client's cookie is not set.
    current_user = request.cookies.get('currentUser') or app.config['DEFAULT_USER_ID']

    ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name, current_user,
                                        args.dataset_type, args.epochs,
                                        args.ensemble_size, len(targets))

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir

        # Track progress in a separate process while training runs in this one.
        process = mp.Process(target=progress_bar, args=(args, PROGRESS))
        process.start()
        TRAINING = 1

        # Run training
        logger = create_logger(name='train',
                               save_dir=args.save_dir,
                               quiet=args.quiet)
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        TRAINING = 0
        PROGRESS = mp.Value('d', 0.0)

        # Check if name overlap
        if checkpoint_name != ckpt_name:
            overlap_message = name_already_exists_message('Checkpoint', checkpoint_name, ckpt_name)
            warnings.append(overlap_message)

        # Move models
        for root, _, files in os.walk(args.save_dir):
            for fname in files:
                if not fname.endswith('.pt'):
                    continue
                model_id = db.insert_model(ckpt_id)
                save_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                         f'{model_id}.pt')
                shutil.move(os.path.join(args.save_dir, root, fname), save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
"""Trains a model on a dataset."""

from chemprop.args import TrainArgs
from chemprop.train import meta_cross_validate
from chemprop.utils import create_logger

if __name__ == '__main__':
    # Parse the command line, then hand the arguments and a logger that writes
    # into the save directory to the meta cross-validation routine.
    args = TrainArgs().parse_args()
    meta_cross_validate(
        args,
        create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet),
    )
def run_training_gnn_xgb(args: TrainArgs, logger: Logger = None) -> tuple:
    """
    Trains an ensemble of models, scores the ensemble on the test set and extracts
    features for the train, validation and test sets.

    For each ensemble member the checkpoint with the best validation score is
    reloaded and evaluated on the test set. After the ensemble has been scored,
    the second output of ``predict`` is collected for the three data loaders using
    the last ensemble member only.

    :param args: Arguments.
    :param logger: Logger. When ``None``, messages are printed instead.
    :return: A 7-tuple ``(ensemble_scores, train_feature, val_feature, test_feature,
             train_targets, val_targets, test_targets)``. ``ensemble_scores`` holds one
             score per task; ``train_targets`` are read before the regression scaler
             is applied to the training set.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Training targets are part of the return value for every dataset type, so read
    # them unconditionally (binding them only for regression made the final return
    # fail with UnboundLocalError otherwise).
    train_targets = train_data.targets()

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    val_targets = val_data.targets()
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Some SummaryWriter implementations name the keyword `logdir` instead;
            # only the unexpected-keyword failure should trigger this fallback.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), device=args.device, logger=logger)

        # predict returns a pair here; only the predictions are needed for scoring
        test_preds, _ = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    # Feature extraction uses `model` as left by the loop above, i.e. the last ensemble member.
    # NOTE(review): train_data_loader was built with shuffle=True (and class_balance), so the
    # row order of train_feature may not line up with train_targets - confirm against
    # MoleculeDataLoader before pairing them downstream.
    _, train_feature = predict(model=model, data_loader=train_data_loader, scaler=scaler)
    _, val_feature = predict(model=model, data_loader=val_data_loader, scaler=scaler)
    _, test_feature = predict(model=model, data_loader=test_data_loader, scaler=scaler)

    return ensemble_scores, train_feature, val_feature, test_feature, train_targets, val_targets, test_targets
import os
import torch

# checks: print the working directory and CUDA availability before chemprop is imported
print('working directory is:')
print(os.getcwd())
print('is CUDA available?')
print(torch.cuda.is_available())

# imports (deliberately placed after the environment checks above)
from chemprop.args import TrainArgs
from chemprop.train.pdts import pdts

# instantiate args class and load from dict
# NOTE(review): data_path is a hard-coded absolute path on one user's machine;
# it must be edited before this script can run elsewhere.
args = TrainArgs()
args.from_dict({
    'dataset_type': 'regression',
    'data_path': '/home/willlamb/chempropBayes/data/qm9.csv'
})

##################### ARGS #####################

# architecture: the remaining settings are overridden directly on the args object
args.hidden_size = 500
args.depth = 5
args.ffn_num_layers = 3
args.activation = 'ReLU'
# keep the feed-forward hidden size equal to hidden_size set above
args.ffn_hidden_size = args.hidden_size
# no extra feature files or feature generators are configured
args.features_path = None
args.features_generator = None
args.atom_messages = False