def get_hyperopt_seed(seed: int, dir_path: str) -> int:
    """
    Assigns a seed for hyperopt calculations. Each iteration will start with a different seed.

    :param seed: The initial attempted hyperopt seed.
    :param dir_path: Path to the directory containing hyperopt checkpoint files.
    :return: An integer for use as the hyperopt random seed.
    """
    seed_path = os.path.join(dir_path, HYPEROPT_SEED_FILE_NAME)

    seeds = []
    if os.path.exists(seed_path):
        with open(seed_path, 'r') as f:
            seed_line = next(f)
            seeds.extend(seed_line.split())
    else:
        makedirs(seed_path, isfile=True)

    seeds = [int(sd) for sd in seeds]

    while seed in seeds:
        seed += 1
    seeds.append(seed)

    write_line = " ".join(map(str, seeds)) + '\n'

    with open(seed_path, 'w') as f:
        f.write(write_line)

    return seed
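# Usage sketch (the directory path is hypothetical): because previously assigned
# seeds are recorded in the HYPEROPT_SEED_FILE_NAME file, repeated calls with the
# same starting seed return distinct values.
first = get_hyperopt_seed(seed=0, dir_path='hyperopt_checkpoints')   # e.g. returns 0
second = get_hyperopt_seed(seed=0, dir_path='hyperopt_checkpoints')  # e.g. returns 1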
def cross_validate_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> Tuple[float, float]:
    """Runs k-fold cross-validation for a scikit-learn model and returns the mean and standard deviation of the test scores."""
    info = logger.info if logger is not None else print
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each fold across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
def run_split_data(args: Args):
    """Splits data into train, validation, and test sets and saves each split as a CSV."""
    # Load raw data
    with open(args.data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Load SMILES
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)

    # Make sure lines and smiles line up
    assert len(lines) == len(smiles)
    assert all(smile in line for smile, line in zip(smiles, lines))

    # Create data
    data = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        datapoint = MoleculeDatapoint(smiles=smile)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    # Split data
    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    # Save splits
    makedirs(args.save_dir)
    for name, dataset in [('train', train), ('val', val), ('test', test)]:
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)
def run_split_data(data_path: str,
                   split_type: str,
                   split_sizes: Tuple[int, int, int],
                   seed: int,
                   save_dir: str):
    """Splits data into train, dev, and test sets and saves each split as a CSV."""
    # Load raw data
    with open(data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Create data
    data = []
    for line in tqdm(lines):
        datapoint = MoleculeDatapoint(line=line)
        datapoint.line = line  # keep the full raw row so it can be written back out
        data.append(datapoint)
    data = MoleculeDataset(data)

    # Split data
    train, dev, test = split_data(data=data,
                                  split_type=split_type,
                                  sizes=split_sizes,
                                  seed=seed)

    # Save splits
    makedirs(save_dir)
    for name, dataset in [('train', train), ('dev', dev), ('test', test)]:
        with open(os.path.join(save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)
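# A minimal sketch of calling the positional-argument variant above; the file
# paths and the 80/10/10 split are illustrative assumptions, not values
# prescribed by this module.
run_split_data(data_path='data/mols.csv',
               split_type='random',
               split_sizes=(0.8, 0.1, 0.1),
               seed=0,
               save_dir='data/mols_split')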
def modify_predict_args(args: Namespace):
    """
    Modifies and validates predicting args in place.

    :param args: Arguments.
    """
    # Load config file
    if args.config_path is not None:
        with open(args.config_path) as f:
            config = json.load(f)
            for key, value in config.items():
                setattr(args, key, value)

    assert not args.use_compound_names  # not supported
    assert args.test_path
    assert args.preds_path
    assert args.checkpoint_dir is not None or args.checkpoint_path is not None or args.checkpoint_paths is not None

    update_checkpoint_args(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    del args.no_cuda

    # Create directory for preds path
    makedirs(args.preds_path, isfile=True)
def cross_validate_machine(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation"""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    for fold_num in range(args.num_folds):
        if args.dataset_type == 'classification':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/test.csv'
        elif args.dataset_type == 'regression':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler, df = run_training(args, logger)
        if args.loss_save:
            df.to_csv('/home/cxw/python——work/paper_gcn/dmpnn_epoch_loss/' + args.protein + 'loss.csv', index=None)
            # df.to_csv(args.protein + 'loss.csv', index=None)
            break
        dmpnn_scores.append(model_scores)

    # Extract learned D-MPNN features and train classical ML models on them
    train_target, train_feature, val_target, val_feature, test_target, test_feature, \
        train_smiles, val_smiles, test_smiles, test_preds = get_xgboost_feature(args, logger, model)
    train_target = pd.DataFrame(train_target)
    train_feature = pd.DataFrame(train_feature)
    val_target = pd.DataFrame(val_target)
    val_feature = pd.DataFrame(val_feature)
    test_target = pd.DataFrame(test_target)
    test_feature = pd.DataFrame(test_feature)
    train_morgan_feature = get_morgan_feature(train_smiles)
    val_morgan_feature = get_morgan_feature(val_smiles)
    test_morgan_feature = get_morgan_feature(test_smiles)

    if args.dataset_type == 'classification':
        if test_target.shape[1] == 1:
            scores = svm_knn_rf_class(train_feature, train_target, val_feature, val_target,
                                      test_feature, test_target, train_morgan_feature,
                                      val_morgan_feature, test_morgan_feature, test_preds)
        else:
            scores = svm_knn_rf_class_more(train_feature, train_target, val_feature, val_target,
                                           test_feature, test_target, train_morgan_feature,
                                           val_morgan_feature, test_morgan_feature, test_preds)
        scores.columns = ['type', 'auc']
    elif args.dataset_type == 'regression':
        if test_target.shape[1] == 1:
            scores = svm_knn_rf_regre(train_feature, train_target, val_feature, val_target,
                                      test_feature, test_target, train_morgan_feature,
                                      val_morgan_feature, test_morgan_feature, test_preds)
        else:
            scores = svm_knn_rf_regre_more(train_feature, train_target, val_feature, val_target,
                                           test_feature, test_target, train_morgan_feature,
                                           val_morgan_feature, test_morgan_feature, test_preds)
        scores.columns = ['type', 'RMSE']
    scores.to_csv(args.protein + 'machine_scores.csv')
def predict_sklearn(args: SklearnPredictArgs):
    """Loads a trained scikit-learn model ensemble and makes predictions on the test data."""
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_column=args.smiles_column,
                    target_columns=[])

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles,
                                                  radius=train_args.radius,
                                                  num_bits=train_args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()
        for datapoint in data:
            writer.writerow(datapoint.row)
def cross_validate(args: TrainArgs, logger: Logger = None) -> Tuple[float, float]:
    """k-fold cross validation"""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    task_names = args.target_columns or get_task_names(args.data_path)

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, uncertainty_estimator = run_training(args, logger)

        # Save one model for each fold
        if uncertainty_estimator:
            process_estimator(uncertainty_estimator, logger, args, fold_num)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

        if args.show_individual_scores:
            for task_name, score in zip(task_names, scores):
                info(f'Seed {init_seed + fold_num} ==> test {task_name} {args.metric} = {score:.6f}')

    # Report scores across models
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    if args.show_individual_scores:
        for task_num, task_name in enumerate(task_names):
            info(f'Overall test {task_name} {args.metric} = '
                 f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}')

    return mean_score, std_score
def modify_predict_args(args: Namespace):
    """
    Modifies and validates predicting args in place.

    :param args: Arguments.
    """
    assert args.test_path
    assert args.preds_path
    assert args.checkpoint_dir is not None or args.checkpoint_path is not None or args.checkpoint_paths is not None

    update_checkpoint_args(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    del args.no_cuda

    # Create directory for preds path
    makedirs(args.preds_path, isfile=True)
def cross_validate_sklearn(args: SklearnTrainArgs) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation for a scikit-learn model.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=SKLEARN_TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    info = logger.info if logger is not None else print
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each fold across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
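# Typical entry point for the cross-validation above, assuming a Tap-style
# SklearnTrainArgs that supports parse_args() (an assumption about the args
# class, which is not defined in this module):
if __name__ == '__main__':
    mean, std = cross_validate_sklearn(args=SklearnTrainArgs().parse_args())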
def save_similar_mols(test_path: str,
                      train_path: str,
                      save_path: str,
                      distance_measure: str,
                      checkpoint_path: str = None,
                      num_neighbors: int = None,
                      batch_size: int = 50,
                      smiles_column: str = None):
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.

    Loads molecules and model from file and saves results to file.

    :param test_path: Path to a CSV file containing test SMILES.
    :param train_path: Path to a CSV file containing train SMILES.
    :param save_path: Path to a CSV file where the results will be saved.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
    :param checkpoint_path: Path to a .pt model checkpoint file (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
    :param batch_size: Batch size.
    :param smiles_column: The name of the column containing SMILES.
    """
    # Find similar molecules
    similar_mols = find_similar_mols_from_file(
        test_path=test_path,
        train_path=train_path,
        checkpoint_path=checkpoint_path,
        distance_measure=distance_measure,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        smiles_column=smiles_column,
    )

    # Save results
    makedirs(save_path, isfile=True)

    with open(save_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=similar_mols[0].keys())
        writer.writeheader()
        for row in similar_mols:
            writer.writerow(row)
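# Illustrative call with hypothetical file paths. A checkpoint is only required
# when distance_measure == 'embedding'; fingerprint-based measures need none.
save_similar_mols(test_path='data/test.csv',
                  train_path='data/train.csv',
                  save_path='similar/neighbors.csv',
                  distance_measure='embedding',
                  checkpoint_path='checkpoints/model.pt',
                  num_neighbors=5)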
def cross_validate(args: Namespace, logger: Logger = None) -> Tuple[float, float]:
    """k-fold cross validation"""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        all_scores.append(run_training(args, logger))

    return _report(all_scores, args.data_df.columns, init_seed, args.metric, info)
def predict_sklearn(args: Namespace):
    """Makes predictions with an ensemble of pickled scikit-learn models and saves them to a CSV."""
    print('Loading data')
    data = get_data(path=args.test_path)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles,
                                                  radius=args.radius,
                                                  num_bits=args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), args.num_tasks))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(
            model=model,
            model_type=args.model_type,
            dataset_type=args.dataset_type,
            features=data.features()
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print('Saving predictions')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['smiles'] + get_task_names(args.test_path))
        for smiles, pred in zip(data.smiles(), avg_preds):
            writer.writerow([smiles] + pred)
def molecule_fingerprint(args: FingerprintArgs,
                         smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats).
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    if args.fingerprint_type == 'MPN':
        # Only need to supply input features if using the FFN latent representation
        # and if the model calls for them
        validate_feature_sources = False
    else:
        validate_feature_sources = True
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    # Set explicit H option and reaction option
    reset_featurization_parameters()
    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)
    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)
    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        if args.atom_descriptors == 'descriptor':
            # Special case: 'descriptor' extra dimensions need to be added
            total_fp_size = (args.hidden_size + test_data.atom_descriptors_size()) * args.number_of_molecules
        else:
            if args.reaction_solvent:
                total_fp_size = args.hidden_size + args.hidden_size_solvent
            else:
                total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError('With features_only models, there is no latent MPN representation. '
                             'Use last_FFN fingerprint type instead.')
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError('With a ffn_num_layers of 1, there is no latent FFN representation. '
                             'Use MPN fingerprint type instead.')
    else:
        raise ValueError(f'Fingerprint type {args.fingerprint_type} not supported')
    all_fingerprints = np.zeros((len(test_data), total_fp_size, len(args.checkpoint_paths)))

    # Load model
    print(f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.')

    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(args.checkpoint_paths[index])

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (args.features_path is not None or args.features_generator):
            # Truncate any features from the MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints)  # TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    fingerprint_columns = []
    if args.fingerprint_type == 'MPN':
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size // args.number_of_molecules):
                for k in range(args.number_of_molecules):
                    fingerprint_columns.append(f'fp_{j}_mol_{k}')
        else:
            for j in range(total_fp_size // args.number_of_molecules):
                for i in range(len(args.checkpoint_paths)):
                    for k in range(args.number_of_molecules):
                        fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}')
    else:  # args.fingerprint_type == 'last_FFN'
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size):
                fingerprint_columns.append(f'fp_{j}')
        else:
            for j in range(total_fp_size):
                for i in range(len(args.checkpoint_paths)):
                    fingerprint_columns.append(f'fp_{j}_model_{i}')

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = all_fingerprints[valid_index].reshape((len(args.checkpoint_paths) * total_fp_size)) \
            if valid_index is not None else ['Invalid SMILES'] * len(args.checkpoint_paths) * total_fp_size
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns + fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
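# A small standalone numpy illustration (not part of the module) of the array
# layout used above: per-model fingerprints are stacked along the last axis of
# a (num_molecules, total_fp_size, num_models) array, and each molecule's slice
# is flattened so that, for a given fingerprint index, models vary fastest --
# matching the fp_{j}_model_{i} column ordering in the single-molecule case.
import numpy as np

fps = np.arange(2 * 3 * 2).reshape((2, 3, 2))  # 2 molecules, fp size 3, 2 models
row0 = fps[0].reshape(3 * 2)                   # [fp0_m0, fp0_m1, fp1_m0, fp1_m1, fp2_m0, fp2_m1]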
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             atom_descriptors_path=args.separate_test_atom_descriptors_path,
                             bond_features_path=args.separate_test_bond_features_path,
                             phase_features_path=args.separate_test_phase_features_path,
                             smiles_columns=args.smiles_columns,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            atom_descriptors_path=args.separate_val_atom_descriptors_path,
                            bond_features_path=args.separate_val_bond_features_path,
                            phase_features_path=args.separate_val_phase_features_path,
                            smiles_columns=args.smiles_columns,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              num_folds=args.num_folds,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             num_folds=args.num_folds,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     num_folds=args.num_folds,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
            logger=logger,
        )

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(replace_nan_token=0,
                                                               scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(replace_nan_token=0,
                                                            scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing by standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = train_data.normalize_targets()
    elif args.dataset_type == 'spectra':
        debug('Normalizing spectra and excluding spectra regions based on phase')
        args.spectra_phase_mask = load_phase_mask(args.spectra_phase_mask_path)
        for dataset in [train_data, test_data, val_data]:
            data_targets = normalize_spectra(
                spectra=dataset.targets(),
                phase_features=dataset.phase_features(),
                phase_mask=args.spectra_phase_mask,
                excluded_sub_value=None,
                threshold=args.spectra_target_floor,
            )
            dataset.set_targets(data_targets)
        scaler = None
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(f'With class_balance, effective train size = {train_data_loader.iter_size:,}')

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except:
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        # Optionally, overwrite weights
        if args.checkpoint_frzn is not None:
            debug(f'Loading and freezing parameters from {args.checkpoint_frzn}.')
            model = load_frzn_model(model=model,
                                    path=args.checkpoint_frzn,
                                    current_args=args,
                                    logger=logger)

        debug(model)

        if args.checkpoint_frzn is not None:
            debug(f'Number of unfrozen parameters = {param_count(model):,}')
            debug(f'Total number of parameters = {param_count_all(model):,}')
        else:
            debug(f'Number of parameters = {param_count_all(model):,}')

        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                        features_scaler, atom_descriptor_scaler, bond_feature_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metrics=args.metrics,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f'Validation {metric} = {avg_val_score:.6f}')
                writer.add_scalar(f'validation_{metric}', avg_val_score, n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(f'Validation {task_name} {metric} = {val_score:.6f}')
                        writer.add_scalar(f'validation_{task_name}_{metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            avg_val_score = np.nanmean(val_scores[args.metric])
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                                features_scaler, atom_descriptor_scaler, bond_feature_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), device=args.device, logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}')
            writer.add_scalar(f'test_{metric}', avg_test_score, 0)

            if args.show_individual_scores and args.dataset_type != 'spectra':
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}')
                    writer.add_scalar(f'test_{task_name}_{metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}')

    # Save scores
    with open(os.path.join(args.save_dir, 'test_scores.json'), 'w') as f:
        json.dump(ensemble_scores, f, indent=4, sort_keys=True)

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(data={'smiles': test_data.smiles()})
        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [pred[i] for pred in avg_test_preds]
        test_preds_dataframe.to_csv(os.path.join(args.save_dir, 'test_preds.csv'), index=False)

    return ensemble_scores
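# The ensemble evaluation above reduces to elementwise averaging of per-model
# prediction matrices; a toy standalone numpy illustration (not module code):
import numpy as np

sum_test_preds = np.zeros((3, 1))                             # 3 molecules, 1 task
for preds in ([[0.1], [0.4], [0.9]], [[0.3], [0.2], [0.7]]):  # 2 models
    sum_test_preds += np.array(preds)
avg_test_preds = (sum_test_preds / 2).tolist()                # [[0.2], [0.3], [0.8]]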
def generate_and_save_features(args: Namespace):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    :param args: Arguments.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function
    data = get_data(path=args.data_path, max_data_size=None)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(f'"{args.save_path}" already exists and args.restart is False.')

        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    data = data[len(features):]  # restrict to data for which features have not been computed yet
    mols = (d.mol for d in data)

    if args.sequential:
        features_map = map(features_generator, mols)
    else:
        features_map = Pool().imap(features_generator, mols)

    # Get features
    temp_features = []
    for i, feats in tqdm(enumerate(features_map), total=len(data)):
        temp_features.append(feats)

        # Save temporary features every save_frequency
        if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(data) - 1:
            save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'), temp_features)
            features.extend(temp_features)
            temp_features = []
            temp_num += 1

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print('Features array is too large to save as a single file. '
              'Instead keeping features as a directory of files.')
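# Hypothetical invocation via an argparse Namespace; the attribute names mirror
# exactly what generate_and_save_features reads, but the values are assumptions.
from argparse import Namespace

generate_and_save_features(Namespace(data_path='data/mols.csv',
                                     save_path='features/mols.npz',
                                     features_generator='rdkit_2d_normalized',
                                     save_frequency=10000,
                                     restart=False,
                                     sequential=True))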
def make_predictions(args: PredictArgs,
                     smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # Set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=not args.drop_extra_columns)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=0 if sys.platform == 'darwin' else args.num_workers)

    # Partial results for variance robust calculation
    if args.ensemble_variance:
        all_preds = np.zeros((len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}'
                      for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
        if args.ensemble_variance:
            epi_uncs = all_epi_uncs[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()
            smiles_columns = args.smiles_columns
            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + '_epi_unc'] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
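# The epistemic uncertainty above is just the variance across the model axis;
# a toy standalone illustration (2 molecules, 1 task, 3 models):
import numpy as np

all_preds = np.array([[[0.1, 0.2, 0.3]],
                      [[0.5, 0.5, 0.5]]])
epi_uncs = np.var(all_preds, axis=2)  # nonzero where models disagree, zero where they agree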
def compare_datasets_tsne(args: Args):
    if len(args.smiles_paths) > len(args.colors) or len(args.smiles_paths) > len(args.sizes):
        raise ValueError('Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Load the smiles datasets
    print('Loading data')
    smiles, slices, labels = [], [], []
    for smiles_path in args.smiles_paths:
        # Get label
        label = os.path.basename(smiles_path).replace('.csv', '')

        # Get SMILES
        new_smiles = get_smiles(path=smiles_path, smiles_columns=args.smiles_column, flatten=True)
        print(f'{label}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles, size=args.max_per_dataset, replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        labels.append(label)
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    start = time.time()
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start:.2f} seconds')

    if args.cluster:
        import hdbscan  # pip install hdbscan
        print('Running HDBSCAN')
        start = time.time()
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        colors = clusterer.fit_predict(X)
        print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity', fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    if args.cluster:
        plt.scatter(X[:, 0], X[:, 1], s=150 * np.mean(args.sizes), c=colors, cmap='nipy_spectral')
    else:
        for slc, color, label, size in zip(slices, args.colors, labels, args.sizes):
            if args.plot_molecules:
                # Plots molecules
                handles.append(mpatches.Patch(color=color, label=label))

                for smile, (x, y) in zip(smiles[slc], X[slc]):
                    img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)], molsPerRow=1, subImgSize=(200, 200))
                    imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(img), (x, y), bboxprops=dict(color=color))
                    ax.add_artist(imagebox)
            else:
                # Plots points
                plt.scatter(X[slc, 0], X[slc, 1], s=150 * size, color=color, label=label)

        if args.plot_molecules:
            legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([])
    plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
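# Illustrative arguments for the t-SNE comparison; the attribute names mirror
# what compare_datasets_tsne reads, and a plain Namespace duck-types for the
# Args class (an assumption, since Args is defined elsewhere). Paths and
# values are assumptions.
from argparse import Namespace

compare_datasets_tsne(Namespace(smiles_paths=['data/train.csv', 'data/test.csv'],
                                smiles_column=None,
                                colors=['red', 'blue'],
                                sizes=[1.0, 1.0],
                                max_per_dataset=10000,
                                cluster=False,
                                plot_molecules=False,
                                scale=1.0,
                                save_path='plots/tsne.png'))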
def predict_and_save(args: PredictArgs,
                     train_args: TrainArgs,
                     test_data: MoleculeDataset,
                     task_names: List[str],
                     num_tasks: int,
                     test_data_loader: MoleculeDataLoader,
                     full_data: MoleculeDataset,
                     full_to_valid_indices: dict,
                     models: List[MoleculeModel],
                     scalers: List[List[StandardScaler]],
                     return_invalid_smiles: bool = False):
    """
    Function to predict with a model and save the predictions to file.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param train_args: A :class:`~chemprop.args.TrainArgs` object containing arguments for training the model.
    :param test_data: A :class:`~chemprop.data.MoleculeDataset` containing valid datapoints.
    :param task_names: A list of task names.
    :param num_tasks: Number of tasks.
    :param test_data_loader: A :class:`~chemprop.data.MoleculeDataLoader` to load the test data.
    :param full_data: A :class:`~chemprop.data.MoleculeDataset` containing all (valid and invalid) datapoints.
    :param full_to_valid_indices: A dictionary mapping full to valid indices.
    :param models: A list or generator object of :class:`~chemprop.models.MoleculeModel`\ s.
    :param scalers: A list or generator object of :class:`~chemprop.features.scaler.StandardScaler` objects.
    :param return_invalid_smiles: Whether to return predictions of "Invalid SMILES" for invalid SMILES,
                                  otherwise will skip them in returned predictions.
    :return: A list of lists of target predictions.
    """
    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Partial results for variance robust calculation
    if args.ensemble_variance or args.individual_ensemble_predictions:
        if args.dataset_type == 'multiclass':
            all_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes, len(args.checkpoint_paths)))
        else:
            all_preds = np.zeros((len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, (model, scaler_list) in enumerate(tqdm(zip(models, scalers), total=len(args.checkpoint_paths))):
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = scaler_list

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

        # Make predictions
        model_preds = predict(
            model=model,
            data_loader=test_data_loader,
            scaler=scaler
        )
        if args.dataset_type == 'spectra':
            model_preds = normalize_spectra(
                spectra=model_preds,
                phase_features=test_data.phase_features(),
                phase_mask=args.spectra_phase_mask,
                excluded_sub_value=float('nan')
            )
        sum_preds += np.array(model_preds)
        if args.ensemble_variance or args.individual_ensemble_predictions:
            if args.dataset_type == 'multiclass':
                all_preds[:, :, :, index] = model_preds
            else:
                all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)

    if args.ensemble_variance:
        if args.dataset_type == 'spectra':
            all_epi_uncs = roundrobin_sid(all_preds)
        else:
            all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Set multiclass column names, update num_tasks definition for multiclass
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}' for name in task_names for i in range(args.multiclass_num_classes)]
        num_tasks = num_tasks * args.multiclass_num_classes

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * num_tasks
        if args.ensemble_variance:
            if args.dataset_type == 'spectra':
                epi_uncs = all_epi_uncs[valid_index] if valid_index is not None else ['Invalid SMILES']
            else:
                epi_uncs = all_epi_uncs[valid_index] if valid_index is not None else ['Invalid SMILES'] * num_tasks
        if args.individual_ensemble_predictions:
            ind_preds = all_preds[valid_index] if valid_index is not None \
                else [['Invalid SMILES'] * len(args.checkpoint_paths)] * num_tasks

        # Reshape multiclass to merge task and class dimension, with updated num_tasks
        if args.dataset_type == 'multiclass':
            if isinstance(preds, np.ndarray) and preds.ndim > 1:
                preds = preds.reshape((num_tasks))
                if args.ensemble_variance or args.individual_ensemble_predictions:
                    ind_preds = ind_preds.reshape((num_tasks, len(args.checkpoint_paths)))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()
            smiles_columns = args.smiles_columns
            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred
        if args.individual_ensemble_predictions:
            for pred_name, model_preds in zip(task_names, ind_preds):
                for idx, pred in enumerate(model_preds):
                    datapoint.row[pred_name + f'_model_{idx}'] = pred
        if args.ensemble_variance:
            if args.dataset_type == 'spectra':
                datapoint.row['epi_unc'] = epi_uncs
            else:
                for pred_name, epi_unc in zip(task_names, epi_uncs):
                    datapoint.row[pred_name + '_epi_unc'] = epi_unc

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    # Return predicted values
    avg_preds = avg_preds.tolist()
    if return_invalid_smiles:
        full_preds = []
        for full_index in range(len(full_data)):
            valid_index = full_to_valid_indices.get(full_index, None)
            preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * num_tasks
            full_preds.append(preds)
        return full_preds
    else:
        return avg_preds
def grid_search(args: Namespace):
    # Create loggers
    logger = create_logger(name='hyperparameter_optimization', save_dir=args.log_dir, quiet=True)
    train_logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)

    # Run grid search
    results = []

    # Define hyperparameter optimization
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Update args with hyperparams
        hyper_args = deepcopy(args)
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        # Record hyperparameters
        logger.info(hyperparams)

        # Cross validate
        mean_score, std_score = cross_validate(hyper_args, train_logger)

        # Record results
        temp_model = build_model(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        results.append({
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError('Can\'t handle nan score for non-classification dataset.')

        return (1 if hyper_args.minimize_score else -1) * mean_score

    fmin(objective, SPACE, algo=tpe.suggest, max_evals=args.num_iters)

    # Report best result
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info('best')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
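# grid_search drives hyperopt's fmin over SPACE and INT_KEYS (defined elsewhere
# in this module); a typical entry point, with parse_hyperopt_args() standing
# in as a hypothetical CLI parser for these arguments:
if __name__ == '__main__':
    grid_search(parse_hyperopt_args())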
def make_predictions(args: PredictArgs,
                     smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}'
                      for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(morgan_fingerprint(mol=s,
                                                         radius=train_args.radius,
                                                         num_bits=train_args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)  # TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()
        for datapoint in data:
            writer.writerow(datapoint.row)
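# Typical command-line entry point for the prediction function above, assuming
# a Tap-style SklearnPredictArgs with parse_args() (an assumption; the args
# class is defined elsewhere):
if __name__ == '__main__':
    predict_sklearn(args=SklearnPredictArgs().parse_args())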
def modify_train_args(args: Namespace):
    """
    Modifies and validates training arguments in place.

    :param args: Arguments.
    """
    global temp_dir  # Prevents the temporary directory from being deleted upon function return

    # Load config file
    if args.config_path is not None:
        with open(args.config_path) as f:
            config = json.load(f)
            for key, value in config.items():
                setattr(args, key, value)

    assert args.data_path is not None
    assert args.dataset_type is not None

    if args.save_dir is not None:
        makedirs(args.save_dir)
    else:
        temp_dir = TemporaryDirectory()
        args.save_dir = temp_dir.name

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    del args.no_cuda

    args.features_scaling = not args.no_features_scaling
    del args.no_features_scaling

    if args.metric is None:
        if args.dataset_type == 'classification':
            args.metric = 'auc'
        elif args.dataset_type == 'multiclass':
            args.metric = 'cross_entropy'
        else:
            args.metric = 'rmse'

    if not ((args.dataset_type == 'classification' and args.metric in ['auc', 'prc-auc', 'accuracy']) or
            (args.dataset_type == 'regression' and args.metric in ['rmse', 'mae', 'mse', 'r2']) or
            (args.dataset_type == 'multiclass' and args.metric in ['cross_entropy', 'accuracy'])):
        raise ValueError(
            f'Metric "{args.metric}" invalid for dataset type "{args.dataset_type}".')

    args.minimize_score = args.metric in ['rmse', 'mae', 'mse', 'cross_entropy']

    update_checkpoint_args(args)

    if args.features_only:
        assert args.features_generator or args.features_path

    args.use_input_features = args.features_generator or args.features_path

    if args.features_generator is not None and 'rdkit_2d_normalized' in args.features_generator:
        assert not args.features_scaling

    args.num_lrs = 1

    if args.ffn_hidden_size is None:
        args.ffn_hidden_size = args.hidden_size

    assert (args.split_type == 'predetermined') == (args.folds_file is not None) == (args.test_fold_index is not None)
    assert (args.split_type == 'crossval') == (args.crossval_index_dir is not None)
    assert (args.split_type in ['crossval', 'index_predetermined']) == (args.crossval_index_file is not None)

    if args.split_type in ['crossval', 'index_predetermined']:
        with open(args.crossval_index_file, 'rb') as rf:
            args.crossval_index_sets = pickle.load(rf)
        args.num_folds = len(args.crossval_index_sets)
        args.seed = 0

    if args.test:
        args.epochs = 0
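# The metric rules enforced by modify_train_args, restated as lookup tables.
# This is a sketch for clarity only; the function above remains the source of
# truth, and these names do not exist elsewhere in the codebase.
DEFAULT_METRIC_BY_DATASET_TYPE = {
    'classification': 'auc',
    'multiclass': 'cross_entropy',
    'regression': 'rmse',
}
VALID_METRICS_BY_DATASET_TYPE = {
    'classification': ['auc', 'prc-auc', 'accuracy'],
    'regression': ['rmse', 'mae', 'mse', 'r2'],
    'multiclass': ['cross_entropy', 'accuracy'],
}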
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    args: Union[PredictArgs, TrainArgs]

    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
        print(f'Using parcels: {num_iterations}')
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
        print('Not using parcels.')

    if args.parcel_offset:
        offset = args.parcel_offset * args.parcel_size  # fixed: was the undefined name `parcel_size`
    else:
        offset = 0

    for iteration in range(num_iterations):
        print('Loading data')
        if smiles is not None:
            full_data = get_data_from_smiles(
                smiles=smiles,
                skip_invalid_smiles=False,
                features_generator=args.features_generator)
        else:
            print('Getting without SMILES')
            full_data = get_data(path=args.test_path,
                                 args=args,
                                 target_columns=[],
                                 max_data_size=max_data_size,
                                 data_offset=offset,
                                 skip_invalid_smiles=False)

        print('Validating SMILES')
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # Edge case if empty list of SMILES is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        print(f'Test size = {len(test_data):,}')

        # Normalize features
        if args.features_scaling:
            test_data.normalize_features(features_scaler)

        # Predict with each model individually and sum predictions
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

        # Create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=args.batch_size,
                                              num_workers=args.num_workers)

        print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # Load model
            model = load_checkpoint(checkpoint_path, device=args.device)
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        # Save predictions: parcels after the first get a numbered output file
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = '{name}.{it}.csv'.format(name=name, it=iteration)
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(test_data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Get prediction column names (multiclass expands each task into one column per class)
        if args.dataset_type == 'multiclass':
            task_names = [
                f'{name}_class_{i}' for name in task_names
                for i in range(args.multiclass_num_classes)
            ]

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)

            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()
            for datapoint in full_data:
                writer.writerow(datapoint.row)

        offset = offset + args.parcel_size  # fixed: was the undefined name `parcel_size`

    return avg_preds
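# Parcel bookkeeping from make_predictions above, shown in isolation: a dataset
# of max_data_size rows is scored in ceil(max_data_size / parcel_size) chunks,
# each chunk reading parcel_size rows from an increasing offset. The numbers
# below are illustrative only.
import math

example_max_data_size, example_parcel_size = 10_000, 3_000
example_num_iterations = math.ceil(example_max_data_size / example_parcel_size)   # -> 4 parcels
example_offsets = [i * example_parcel_size for i in range(example_num_iterations)]  # -> [0, 3000, 6000, 9000]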
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    :param args: Arguments.
    :param logger: Logger.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    # =============================================================================
    #     debug(pformat(vars(args)))
    # =============================================================================

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),  # fixed: was (0.8, 0.2, 0.0), which left the test split empty
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        print('=' * 100)
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    ### my_code: export the splits to CSV for inspection (paths are machine-specific) ###
    train_df = get_data_df(train_data)
    train_df.to_csv('~/PycharmProjects/CMPNN-master/data/24w_train_df_seed0.csv')
    val_df = get_data_df(val_data)
    val_df.to_csv('~/PycharmProjects/CMPNN-master/data/24w_val_df_seed0.csv')
    test_df = get_data_df(test_data)
    test_df.to_csv('~/PycharmProjects/CMPNN-master/data/24w_test_df_seed0.csv')
    ##########

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        with open(args.data_path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset.smiles():
                    writer.writerow([smiles])

            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset.smiles():
                    writer.writerow(lines_by_smiles[smiles])

            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
            split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)

        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing by standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:  # older tensorboardX versions use `logdir` (was a bare except)
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in range(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()

            val_scores = evaluate(model=model,
                                  data=val_data,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  batch_size=args.batch_size,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_preds = predict(model=model,
                             data=test_data,
                             batch_size=args.batch_size,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
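# The checkpoint-selection rule used in the training loop above, restated in
# isolation: a new checkpoint is written only when the validation score improves,
# where the direction of "improvement" follows args.minimize_score. The helper
# name is hypothetical; the in-loop condition is the actual implementation.
def is_improved(avg_val_score: float, best_score: float, minimize_score: bool) -> bool:
    """True if avg_val_score beats best_score under the given metric direction."""
    return avg_val_score < best_score if minimize_score else avg_val_score > best_score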
def hyperopt(args: HyperoptArgs) -> None:
    """
    Runs hyperparameter optimization on a Chemprop model.

    Hyperparameter optimization optimizes the following parameters:

    * :code:`hidden_size`: The hidden size of the neural network layers is selected from {300, 400, ..., 2400}
    * :code:`depth`: The number of message passing iterations is selected from {2, 3, 4, 5, 6}
    * :code:`dropout`: The dropout probability is selected from {0.0, 0.05, ..., 0.4}
    * :code:`ffn_num_layers`: The number of feed-forward layers after message passing is selected from {1, 2, 3}

    The best set of hyperparameters is saved as a JSON file to :code:`args.config_save_path`.

    :param args: A :class:`~chemprop.args.HyperoptArgs` object containing arguments for hyperparameter
                 optimization in addition to all arguments needed for training.
    """
    # Create logger
    logger = create_logger(name=HYPEROPT_LOGGER_NAME, save_dir=args.log_dir, quiet=True)

    # Load in manual trials
    if args.manual_trial_dirs is not None:
        manual_trials = load_manual_trials(args.manual_trial_dirs, SPACE.keys(), args)
        logger.info(f'{len(manual_trials)} manual trials included in hyperparameter search.')
    else:
        manual_trials = None
        logger.info('No manual trials loaded as part of hyperparameter search.')

    makedirs(args.hyperopt_checkpoint_dir)

    # Define hyperparameter optimization objective
    def objective(hyperparams: Dict[str, Union[int, float]], seed: int) -> Dict:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Copy args
        hyper_args = deepcopy(args)

        # Update args with hyperparams
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)

        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        hyper_args.ffn_hidden_size = hyper_args.hidden_size

        # Cross validate
        mean_score, std_score = cross_validate(args=hyper_args, train_func=run_training)

        # Record results
        temp_model = MoleculeModel(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'Trial results with seed {seed}')
        logger.info(hyperparams)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError('Can\'t handle nan score for non-classification dataset.')

        loss = (1 if hyper_args.minimize_score else -1) * mean_score

        return {
            'loss': loss,
            'status': 'ok',
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params,
            'seed': seed,
        }

    # Iterate over a number of trials
    for i in range(args.num_iters):
        # Run fmin and load trials in single steps to allow for parallel operation
        trials = load_trials(dir_path=args.hyperopt_checkpoint_dir, previous_trials=manual_trials)
        if len(trials) >= args.num_iters:
            break

        # Set a unique random seed for each trial. Pass it into the objective function for logging purposes.
        hyperopt_seed = get_hyperopt_seed(seed=args.seed, dir_path=args.hyperopt_checkpoint_dir)
        fmin_objective = partial(objective, seed=hyperopt_seed)
        os.environ['HYPEROPT_FMIN_SEED'] = str(hyperopt_seed)  # this environment variable changes the seed in fmin

        # Log the start of the trial
        logger.info(f'Initiating trial with seed {hyperopt_seed}')
        logger.info(f'Loaded {len(trials)} previous trials')
        if len(trials) < args.startup_random_iters:
            random_remaining = args.startup_random_iters - len(trials)
            logger.info(f'Parameters assigned with random search, {random_remaining} random trials remaining')
        else:
            logger.info('Parameters assigned with TPE directed search')

        fmin(
            fmin_objective,
            SPACE,
            algo=partial(tpe.suggest, n_startup_jobs=args.startup_random_iters),
            max_evals=len(trials) + 1,
            trials=trials,
        )

        # Create a trials object with only the last instance by merging the last data with an empty trials object
        last_trial = merge_trials(Trials(), [trials.trials[-1]])
        save_trials(args.hyperopt_checkpoint_dir, last_trial, hyperopt_seed)

    # Report best result
    all_trials = load_trials(dir_path=args.hyperopt_checkpoint_dir, previous_trials=manual_trials)
    results = all_trials.results
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info(f'Best trial, with seed {best_result["seed"]}')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)
    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
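# A sketch of the search space described in the hyperopt docstring, written with
# hyperopt's quantized-uniform primitives. The real SPACE and INT_KEYS are defined
# elsewhere in the codebase; the bounds below simply restate the docstring, so
# treat this as illustrative rather than authoritative.
from hyperopt import hp

SPACE_SKETCH = {
    'hidden_size': hp.quniform('hidden_size', low=300, high=2400, q=100),
    'depth': hp.quniform('depth', low=2, high=6, q=1),
    'dropout': hp.quniform('dropout', low=0.0, high=0.4, q=0.05),
    'ffn_num_layers': hp.quniform('ffn_num_layers', low=1, high=3, q=1),
}
INT_KEYS_SKETCH = ['hidden_size', 'depth', 'ffn_num_layers']  # cast from float to int in objective()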
    # Tail of a run_training variant (the enclosing function definition is elided above):
    # evaluate the best checkpoint on the test set and return the average test score.
    model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda)
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    test_preds = predict(model, test_data, args.batch_size)
    test_scores = evaluate_predictions(test_preds, test_targets, args.num_tasks,
                                       metric_func, args.dataset_type)

    avg_test_score = np.nanmean(test_scores)
    print(f'Test {args.metric} = {avg_test_score:.4f}')

    return avg_test_score


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--source_data_path', required=True)
    parser.add_argument('--src_batch_size', type=int, default=100)
    parser.add_argument('--lambda_e', type=float, default=0.1)
    add_train_args(parser)
    args = parser.parse_args()
    modify_train_args(args)

    all_test_score = np.zeros((args.num_folds,))
    for i in range(args.num_folds):
        fold_dir = os.path.join(args.save_dir, f'fold_{i}')
        makedirs(fold_dir)
        # this script's run_training variant takes a save directory and returns
        # the average test score (see the function tail above)
        all_test_score[i] = run_training(args, fold_dir)

    mean, std = np.mean(all_test_score), np.std(all_test_score)
    print(f'{args.num_folds} fold average: {mean:.4f} +/- {std:.4f}')
def cross_validate(
    args: TrainArgs,
    train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path, args=args, logger=logger, skip_none_targets=True)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = train_func(args, deepcopy(data), logger)  # deepcopy since data may be modified
        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {np.nanmean(scores[fold_num]):.6f}')

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = np.nanmean(scores, axis=1)  # average score for each model across tasks
        mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(f'\tOverall test {task_name} {metric} = '
                     f'{np.nanmean(scores[:, task_num]):.6f} +/- {np.nanstd(scores[:, task_num]):.6f}')

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        for task_num, task_name in enumerate(args.task_names):
            row = [task_name]
            for metric, scores in all_scores.items():
                task_scores = scores[:, task_num]
                mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = np.nanmean(all_scores[args.metric], axis=1)
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([
            pd.read_csv(os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
            for fold_num in range(args.num_folds)
        ])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
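# The aggregation used above, in miniature: scores are averaged across tasks
# within each fold (np.nanmean skips tasks whose score is NaN), then summarized
# across folds. The numbers here are made up purely to show the shapes involved.
scores_example = np.array([[0.81, 0.79],      # fold 0: two tasks
                           [0.84, np.nan]])   # fold 1: second task had no valid targets
avg_per_fold = np.nanmean(scores_example, axis=1)                              # -> [0.80, 0.84]
mean_example, std_example = np.nanmean(avg_per_fold), np.nanstd(avg_per_fold)  # -> 0.82, 0.02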
def run_training_gnn_xgb(args: TrainArgs, logger: Logger = None) -> Tuple:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score,
    along with the learned molecule-level features and targets for each split (for a downstream model).

    :param args: Arguments.
    :param logger: Logger.
    :return: A tuple of (ensemble_scores, train_feature, val_feature, test_feature,
             train_targets, val_targets, test_targets).
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing by standard deviation (regression only).
    # train_targets is captured before scaling (and for all dataset types) since it is returned below;
    # previously it was only defined in the regression branch, causing a NameError otherwise.
    train_smiles, train_targets = train_data.smiles(), train_data.targets()
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    val_smiles, val_targets = val_data.smiles(), val_data.targets()
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:  # older tensorboardX versions use `logdir` (was a bare except)
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()

            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        test_preds, _ = predict(model=model,
                                data_loader=test_data_loader,
                                scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    # Extract learned features for a downstream model (uses the last ensemble member)
    _, train_feature = predict(model=model, data_loader=train_data_loader, scaler=scaler)
    _, val_feature = predict(model=model, data_loader=val_data_loader, scaler=scaler)
    _, test_feature = predict(model=model, data_loader=test_data_loader, scaler=scaler)

    return ensemble_scores, train_feature, val_feature, test_feature, train_targets, val_targets, test_targets
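# One plausible way to consume the features returned by run_training_gnn_xgb,
# given the pairing its name implies. This is a hedged sketch, not code from the
# repository: it assumes the xgboost package is available, and the helper name
# and hyperparameters are illustrative.
import xgboost as xgb


def fit_xgb_on_gnn_features(train_feature, train_targets, test_feature):
    """Fit a gradient-boosted regressor on GNN-derived features; return test predictions."""
    booster = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6)
    booster.fit(np.array(train_feature), np.array(train_targets).ravel())
    return booster.predict(np.array(test_feature))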
def cross_validate(args: TrainArgs, logger: Logger = None) -> Tuple[float, float]:
    """k-fold cross-validation"""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_training(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

        if args.show_individual_scores:
            for task_name, score in zip(args.task_names, scores):
                info(f'Seed {init_seed + fold_num} ==> test {task_name} {args.metric} = {score:.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    if args.show_individual_scores:
        for task_num, task_name in enumerate(args.task_names):
            info(f'Overall test {task_name} {args.metric} = '
                 f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}')

    # Save scores
    with open(os.path.join(save_dir, 'test_scores.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Task', f'Mean {args.metric}', f'Standard deviation {args.metric}'] +
                        [f'Fold {i} {args.metric}' for i in range(args.num_folds)])

        for task_num, task_name in enumerate(args.task_names):
            task_scores = all_scores[:, task_num]
            mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
            writer.writerow([task_name, mean, std] + task_scores.tolist())

    return mean_score, std_score