def run_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    For each of ``args.ensemble_size`` models the function trains for
    ``args.epochs`` epochs, checkpoints the model whenever the average
    validation score improves, reloads the best checkpoint and evaluates it on
    the test set. Test predictions are averaged over the ensemble.

    :param args: Arguments.
    :param logger: Logger. When None, messages are printed instead.
    :return: A tuple ``(ensemble_scores, uncertainty_estimator)`` where
             ``ensemble_scores`` is a list of ensemble scores for each task and
             ``uncertainty_estimator`` is None unless ``args.uncertainty`` is
             ``'random_forest'`` or ``'gaussian'``.
             NOTE(review): the ``List[float]`` return annotation does not match
             this tuple; it is kept unchanged for interface compatibility.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        # Both held-out sets were supplied, so all of `data` is training data
        train_data = data
    elif args.separate_val_path:
        # Validation set supplied: carve only a test set out of `data`
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        # Test set supplied: carve only a validation set out of `data`
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir)

    if args.features_scaling:
        # Fit the feature scaler on the training set only, then reuse it
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)
    args.val_data_size = len(val_data)
    args.test_data_size = len(test_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Only using UQ methods if we have to train an estimator
    if args.uncertainty in ('random_forest', 'gaussian'):
        uncertainty_estimator = uncertainty_estimator_builder(
            args.uncertainty)(args, train_data, scaler)
    else:
        uncertainty_estimator = None

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling; an unexpected
            # keyword argument surfaces as TypeError, anything else is a
            # genuine failure and must propagate.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # Only ExponentialLR is stepped here, once per epoch
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  args=args,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Accumulate predictions so they can be averaged over the ensemble
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if uncertainty_estimator is not None:
            uncertainty_estimator.process_model(model)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    # Logged through the writer of the last trained model
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores, uncertainty_estimator
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    For each of ``args.ensemble_size`` models the function trains for up to
    ``args.epochs`` epochs, checkpoints the model whenever the average
    validation score improves, and stops early once the validation score has
    been worse than the best one for 15 epochs since the last improvement.
    The best checkpoint is then reloaded and evaluated on the test set, and
    test predictions are averaged over the ensemble.

    :param args: Arguments.
    :param logger: Logger. When None, messages are printed instead.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        # Both held-out sets were supplied, so all of `data` is training data
        train_data = data
    elif args.separate_val_path:
        # Validation set supplied: the third split becomes the test set, so it
        # must receive the held-out fraction (the sizes were previously
        # (0.8, 0.2, 0.0), which left the test set empty).
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        # Test set supplied: carve only a validation set out of `data`
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        # Index the original CSV rows by their SMILES string (first column).
        # newline='' is what the csv module requires of the files it is given.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            split_smiles = dataset.smiles()

            # SMILES only
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w', newline='') as f:
                split_writer = csv.writer(f)
                split_writer.writerow(['smiles'])
                for smiles in split_smiles:
                    split_writer.writerow([smiles])

            # Full original rows
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w', newline='') as f:
                split_writer = csv.writer(f)
                split_writer.writerow(header)
                for smiles in split_smiles:
                    split_writer.writerow(lines_by_smiles[smiles])

            # Sorted row indices of this split within the original file
            all_split_indices.append(
                sorted(indices_by_smiles[smiles] for smiles in split_smiles))

        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        # Fit the feature scaler on the training set only, then reuse it
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Target normalization (subtract mean, divide by standard deviation) is
    # deliberately disabled in this variant: regression targets are used
    # unscaled and no scaler is stored in the checkpoint.
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = None
        train_data.set_targets(train_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Number of worsening epochs tolerated before training stops early
    patience = 15

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling; an unexpected
            # keyword argument surfaces as TypeError, anything else is a
            # genuine failure and must propagate.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        # Must live outside the epoch loop: resetting it every epoch (as the
        # code previously did) makes the early-stopping threshold unreachable.
        worse_epochs = 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # Only ExponentialLR is stepped here, once per epoch
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  batch_size=args.batch_size,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score; otherwise
            # count epochs that are strictly worse than the best, in whichever
            # direction the metric is optimized.
            if args.minimize_score:
                improved = avg_val_score < best_score
                worsened = avg_val_score > best_score
            else:
                improved = avg_val_score > best_score
                worsened = avg_val_score < best_score

            if improved:
                worse_epochs = 0
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)
            elif worsened:
                worse_epochs += 1
                if worse_epochs == patience:
                    break

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        # predict() returns a second value in this variant; it is not used here
        test_preds, check_fp = predict(model=model,
                                       data=test_data,
                                       batch_size=args.batch_size,
                                       scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Accumulate predictions so they can be averaged over the ensemble
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    # Logged through the writer of the last trained model
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_path.

    Predictions are averaged over all checkpoints in ``args.checkpoint_paths``
    and written to ``args.preds_path`` as a CSV with one row per input
    datapoint; rows whose molecule could not be parsed get the literal
    ``'Invalid SMILES'`` in every prediction column.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions, one entry per *valid*
             molecule. If no molecule is valid, a list of None with one entry
             per input datapoint is returned and nothing is written.
    :raises ValueError: If features were used during training but neither
                        ``args.features_path`` nor ``args.features_generator``
                        is set for prediction.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if train_args.features_path is not None or train_args.features_generator is not None:
        if args.features_path is None and args.features_generator is None:
            raise ValueError(
                'Features were used during training so they must be specified again during prediction '
                'using the same type of features as before (with either --features_generator or '
                '--features_path and using --no_features_scaling if applicable).'
            )

    # Update predict args with training arguments to create a merged args object
    # (values already present on the predict args take precedence)
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             args=args,
                             target_columns=[],
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    # Maps the index of each valid datapoint in full_data to its index in test_data
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = avg_preds[valid_index]
        else:
            preds = ['Invalid SMILES'] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save. newline='' is what the csv module requires of the file object;
    # without it extra blank lines appear on platforms that translate '\n'.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_path.

    Predictions are averaged over all checkpoints in ``args.checkpoint_paths``
    and written to ``args.preds_path`` as a CSV with one row per input
    datapoint; rows whose molecule could not be parsed get empty prediction
    cells.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list with one entry per input datapoint: a list of target
             predictions for valid molecules and None for invalid ones. If no
             molecule is valid, nothing is written.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    # (values already present on the predict args take precedence)
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        # Names must come from the unfiltered dataset: the output rows below
        # are indexed by position in full_data, so names taken from the
        # valid-only subset would be shifted (or run out) after the first
        # invalid SMILES.
        compound_names = full_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=args.batch_size,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for valid_pos, full_pos in enumerate(valid_indices):
        full_preds[full_pos] = avg_preds[valid_pos]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Number of prediction cells per row, used to pad rows for invalid SMILES
    if args.dataset_type == 'multiclass':
        num_pred_columns = args.num_tasks * args.multiclass_num_classes
    else:
        num_pred_columns = args.num_tasks

    # Write predictions. newline='' is what the csv module requires of the
    # file object; without it extra blank lines appear on platforms that
    # translate '\n'.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i, preds in enumerate(avg_preds):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])

            if preds is None:
                # Invalid SMILES: keep the column count but leave cells empty
                row.extend([''] * num_pred_columns)
            elif args.dataset_type == 'multiclass':
                # Flatten per-task class probabilities into one row
                for task_probs in preds:
                    row.extend(task_probs)
            else:
                row.extend(preds)

            writer.writerow(row)

    return avg_preds
def train():
    """
    Renders the train page and performs training if request method is POST.

    On POST the form fields ``dataName``, ``epochs``, ``ensembleSize`` and
    ``checkpointName`` (plus optional ``gpu`` and ``datasetType``) configure a
    training run. The run happens in a temporary directory; every resulting
    ``.pt`` file is registered in the database and moved into
    ``app.config['CHECKPOINT_FOLDER']``.

    :return: The rendered train page, including validation errors or, after a
             successful run, the per-task and mean scores.
    """
    global PROGRESS, TRAINING

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, ensemble_size, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), \
        int(request.form['ensembleSize']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    # NOTE(review): data_name comes straight from the request and is joined
    # into a filesystem path - confirm it is validated/sanitized upstream.
    data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv')
    dataset_type = request.form.get('datasetType', 'regression')

    # Create and modify args
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args([])

    args.data_path = data_path
    args.dataset_type = dataset_type
    args.epochs = epochs
    args.ensemble_size = ensemble_size

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    unique_targets = set(np.unique(targets))

    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append(
            'Selected classification dataset but not all labels are 0 or 1. Select regression instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append(
            'Selected regression dataset but all labels are 0 or 1. Select classification instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    if gpu is not None:
        # The form sends the literal string 'None' to request CPU training
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name, current_user,
                                        args.dataset_type, args.epochs,
                                        args.ensemble_size, len(targets))

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)

        process = mp.Process(target=progress_bar, args=(args, PROGRESS))
        process.start()
        TRAINING = 1

        try:
            # Run training
            logger = create_logger(name='train',
                                   save_dir=args.save_dir,
                                   quiet=args.quiet)
            task_scores = run_training(args, logger)
        except BaseException:
            # Training failed, so the progress process may never see the
            # progress it is waiting for (TODO confirm against progress_bar);
            # stop it explicitly so the join below cannot block.
            process.terminate()
            raise
        finally:
            process.join()

            # Reset globals, also on failure, so the app does not stay stuck
            # in the "training" state.
            TRAINING = 0
            PROGRESS = mp.Value('d', 0.0)

        # Check if name overlap
        if checkpoint_name != ckpt_name:
            warnings.append(
                name_already_exists_message('Checkpoint', checkpoint_name,
                                            ckpt_name))

        # Move models. `root` yielded by os.walk already starts with
        # args.save_dir, so it must not be prefixed with it again.
        for root, _, files in os.walk(args.save_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    model_id = db.insert_model(ckpt_id)
                    save_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                             f'{model_id}.pt')
                    shutil.move(os.path.join(root, fname), save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains an ensemble of models and returns the ensemble's test scores.

    For every ensemble member the checkpoint with the best validation score
    (lowest when ``args.minimize_score`` is set, highest otherwise) is reloaded
    and evaluated on the test set; the members' test predictions are then averaged
    and scored.

    :param args: Arguments. Besides being read, this namespace is mutated: task names,
                 task/feature counts, training set size and, depending on the options,
                 bin predictions, class weights and the p-norm target are stored on it.
    :param logger: Logger. When None, messages are printed instead.
    :return: A list of ensemble scores for each task. For the 'unsupervised' dataset
             type ``[0]`` is returned right after the first model has been trained.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    # When features are predicted as well, they are not counted as "real" tasks.
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug(f'Number of tasks = {args.num_tasks}')

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':  # Note: for now, binning based on whole dataset, not just training set
        # For this dataset type get_data returned a 3-tuple, unpacked here.
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug(f'Splitting data with seed {args.seed}')
        # Train split comes from `data`, val/test splits from `regression_data`;
        # both calls use the same split type, sizes and seed.
        train_data, _, _ = split_data(data=data,
                                      split_type=args.split_type,
                                      sizes=args.split_sizes,
                                      seed=args.seed,
                                      args=args,
                                      logger=logger)
        _, val_data, test_data = split_data(regression_data,
                                            split_type=args.split_type,
                                            sizes=args.split_sizes,
                                            seed=args.seed,
                                            args=args,
                                            logger=logger)
    else:
        debug(f'Splitting data with seed {args.seed}')
        if args.separate_test_set:
            test_data = get_data(path=args.separate_test_set,
                                 args=args,
                                 features_path=args.separate_test_set_features,
                                 logger=logger)
            # NOTE: args.separate_val_set is only consulted when a separate test
            # set is given as well.
            if args.separate_val_set:
                val_data = get_data(
                    path=args.separate_val_set,
                    args=args,
                    features_path=args.separate_val_set_features,
                    logger=logger)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(
                    data=data,
                    split_type=args.split_type,
                    sizes=(0.8, 0.2, 0.0),
                    seed=args.seed,
                    args=args,
                    logger=logger)
        else:
            train_data, val_data, test_data = split_data(
                data=data,
                split_type=args.split_type,
                sizes=args.split_sizes,
                seed=args.seed,
                args=args,
                logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

        if args.class_balance:
            # Class weights are the reciprocal of (class size * batch size),
            # computed on the training split only.
            train_class_sizes = get_class_sizes(train_data)
            class_batch_counts = torch.Tensor(
                train_class_sizes) * args.batch_size
            args.class_weights = 1 / torch.Tensor(class_batch_counts)

    if args.save_smiles_splits:
        # Index the original CSV by its first column so each split can be written
        # back both as raw rows and as row indices.
        with open(args.data_path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset.smiles():
                    writer.writerow([smiles])
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset.smiles():
                    writer.writerow(lines_by_smiles[smiles])
            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
                split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        # The scaler is obtained from the training split and then applied to
        # the validation and test splits.
        features_scaler = train_data.normalize_features(
            replace_nan_token=None if args.predict_features else 0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(
        train_data
    ) if args.prespecified_chunk_dir is None else args.prespecified_chunks_max_examples_per_epoch

    # NOTE: val_smiles is only bound in this branch; it is read again in the
    # training loop only under args.adversarial.
    if args.adversarial or args.moe:
        val_smiles, test_smiles = val_data.smiles(), test_data.smiles()

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Optionally truncate outlier values
    if args.truncate_outliers:
        print('Truncating outliers in train set')
        train_data = truncate_outliers(train_data)

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression' and args.target_scaling:
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    if args.moe:
        # After this call train_data is iterated as a collection of per-source
        # datasets (see the chunking code below).
        train_data = cluster_split(train_data,
                                   args.num_sources,
                                   args.cluster_max_ratio,
                                   seed=args.cluster_split_seed,
                                   logger=logger)

    # Chunk training data if too large to load in memory all at once
    if args.num_chunks > 1:
        os.makedirs(args.chunk_temp_dir, exist_ok=True)
        train_paths = []
        if args.moe:
            # Chunk every source separately, then regroup so that chunk i holds
            # the i-th chunk of each source.
            chunked_sources = [td.chunk(args.num_chunks) for td in train_data]
            chunks = []
            for i in range(args.num_chunks):
                chunks.append([source[i] for source in chunked_sources])
        else:
            chunks = train_data.chunk(args.num_chunks)
        for i in range(args.num_chunks):
            chunk_path = os.path.join(args.chunk_temp_dir, str(i) + '.txt')
            memo_path = os.path.join(args.chunk_temp_dir,
                                     'memo' + str(i) + '.txt')
            # Despite the .txt suffix, the chunk file holds a pickle.
            with open(chunk_path, 'wb') as f:
                pickle.dump(chunks[i], f)
            train_paths.append((chunk_path, memo_path))
        # From here on train_data is a list of (chunk_path, memo_path) tuples.
        train_data = train_paths

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric, args=args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.maml:  # TODO refactor
        test_targets = []
        for task_idx in range(len(data.data[0].targets)):
            _, task_test_data, _ = test_data.sample_maml_task(args, seed=0)
            test_targets += task_test_data.targets()

    # Accumulator for the ensemble members' test predictions.
    if args.dataset_type == 'bert_pretraining':
        sum_test_preds = {
            'features':
            np.zeros((len(test_smiles), args.features_size))
            if args.features_size is not None else None,
            'vocab':
            np.zeros((len(test_targets['vocab']), args.vocab.output_size))
        }
    elif args.dataset_type == 'kernel':
        sum_test_preds = np.zeros((len(test_targets), args.num_tasks))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    if args.maml:
        sum_test_preds = None  # annoying to determine exact size; will initialize later

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        test_targets['vocab'] = [
            target if mask == 0 else None
            for target, mask in zip(test_targets['vocab'], test_data.mask())
        ]

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        if args.adjust_weight_decay:
            args.pnorm_target = compute_pnorm(model)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            if args.prespecified_chunk_dir is not None:
                # load some different random chunks each epoch
                train_data, val_data = load_prespecified_chunks(args, logger)
                debug('Loaded prespecified chunks for epoch')

            if args.dataset_type == 'unsupervised':  # won't work with moe
                full_data = MoleculeDataset(train_data.data + val_data.data)
                generate_unsupervised_cluster_labels(
                    build_model(args), full_data,
                    args)  # cluster with a new random init
                model.create_ffn(
                    args
                )  # reset the ffn since we're changing targets-- we're just pretraining the encoder.
                optimizer.param_groups.pop()  # remove ffn parameters
                optimizer.add_param_group({
                    'params': model.ffn.parameters(),
                    'lr': args.init_lr[1],
                    'weight_decay': args.weight_decay[1]
                })
                if args.cuda:
                    model.ffn.cuda()

            if args.gradual_unfreezing:
                if epoch % args.epochs_per_unfreeze == 0:
                    unfroze_layer = model.unfreeze_next(
                    )  # consider just stopping early after we have nothing left to unfreeze?
                    if unfroze_layer:
                        debug('Unfroze last frozen layer')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer,
                           chunk_names=(args.num_chunks > 1),
                           val_smiles=val_smiles if args.adversarial else None,
                           test_smiles=test_smiles
                           if args.adversarial or args.moe else None)
            # Only ExponentialLR is stepped here, once per epoch.
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  metric_func=metric_func,
                                  args=args,
                                  scaler=scaler,
                                  logger=logger)

            if args.dataset_type == 'bert_pretraining':
                # val_scores is a dict here; only the 'vocab' score is kept for
                # model selection.
                if val_scores['features'] is not None:
                    debug(
                        f'Validation features rmse = {val_scores["features"]:.6f}'
                    )
                    writer.add_scalar('validation_features_rmse',
                                      val_scores['features'], n_iter)
                val_scores = [val_scores['vocab']]

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    if task_name in desired_labels:
                        debug(
                            f'Validation {task_name} {args.metric} = {val_score:.6f}'
                        )
                        writer.add_scalar(
                            f'validation_{task_name}_{args.metric}', val_score,
                            n_iter)

            # Save model checkpoint if improved validation score, or always save it if unsupervised
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score or \
                    args.dataset_type == 'unsupervised':
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # NOTE: this returns inside the ensemble loop, i.e. after the first model.
        if args.dataset_type == 'unsupervised':
            return [0]  # rest of this is meaningless when unsupervised

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        if args.split_test_by_overlap_dataset is not None:
            # Report scores separately for test molecules that do / do not occur
            # in the overlap dataset; these scores are only logged.
            overlap_data = get_data(path=args.split_test_by_overlap_dataset,
                                    logger=logger)
            overlap_smiles = set(overlap_data.smiles())
            test_data_intersect, test_data_nonintersect = [], []
            for d in test_data.data:
                if d.smiles in overlap_smiles:
                    test_data_intersect.append(d)
                else:
                    test_data_nonintersect.append(d)
            test_data_intersect, test_data_nonintersect = MoleculeDataset(
                test_data_intersect), MoleculeDataset(test_data_nonintersect)
            for name, td in [('Intersect', test_data_intersect),
                             ('Nonintersect', test_data_nonintersect)]:
                test_preds = predict(model=model,
                                     data=td,
                                     args=args,
                                     scaler=scaler,
                                     logger=logger)
                test_scores = evaluate_predictions(
                    preds=test_preds,
                    targets=td.targets(),
                    metric_func=metric_func,
                    dataset_type=args.dataset_type,
                    args=args,
                    logger=logger)
                avg_test_score = np.nanmean(test_scores)
                info(
                    f'Model {model_idx} test {args.metric} for {name} = {avg_test_score:.6f}'
                )

        if len(
                test_data
        ) == 0:  # just get some garbage results without crashing; in this case we didn't care anyway
            test_preds, test_scores = sum_test_preds, [
                0 for _ in range(len(args.task_names))
            ]
        else:
            test_preds = predict(model=model,
                                 data=test_data,
                                 args=args,
                                 scaler=scaler,
                                 logger=logger)
            test_scores = evaluate_predictions(preds=test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

        if args.maml:
            # Lazily size the accumulator from the first model's predictions.
            if sum_test_preds is None:
                sum_test_preds = np.zeros(np.array(test_preds).shape)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                sum_test_preds['features'] += np.array(test_preds['features'])
            sum_test_preds['vocab'] += np.array(test_preds['vocab'])
        else:
            sum_test_preds += np.array(test_preds)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                debug(
                    f'Model {model_idx} test features rmse = {test_scores["features"]:.6f}'
                )
                writer.add_scalar('test_features_rmse',
                                  test_scores['features'], 0)
            test_scores = [test_scores['vocab']]

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                if task_name in desired_labels:
                    info(
                        f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                    )
                    writer.add_scalar(f'test_{task_name}_{args.metric}',
                                      test_score, n_iter)

    # Evaluate ensemble on test set
    if args.dataset_type == 'bert_pretraining':
        avg_test_preds = {
            'features':
            (sum_test_preds['features'] / args.ensemble_size).tolist()
            if sum_test_preds['features'] is not None else None,
            'vocab': (sum_test_preds['vocab'] / args.ensemble_size).tolist()
        }
    else:
        avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    if len(test_data
           ) == 0:  # just return some garbage when we didn't want test data
        ensemble_scores = test_scores
    else:
        ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

    # Average ensemble score
    # NOTE: `writer` below is the SummaryWriter of the last ensemble member.
    if args.dataset_type == 'bert_pretraining':
        if ensemble_scores['features'] is not None:
            info(
                f'Ensemble test features rmse = {ensemble_scores["features"]:.6f}'
            )
            writer.add_scalar('ensemble_test_features_rmse',
                              ensemble_scores['features'], 0)
        ensemble_scores = [ensemble_scores['vocab']]

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    # (unlike the per-model scores above, these are not filtered by desired_labels)
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    """
    Trains a scikit-learn model on Morgan fingerprints and scores it on the test split.

    :param args: Arguments selecting the data, split, fingerprint settings,
                 dataset type ('regression' or 'classification') and model type
                 ('random_forest' or 'svm').
    :param logger: Logger. When None, messages are printed instead.
    :return: The scores returned by the single-task or multi-task training helper.
    :raises ValueError: If an SVM is requested for data with more than one task, or
                        if the dataset type or the model type is not supported.
    """
    debug = info = print
    if logger is not None:
        debug, info = logger.debug, logger.info

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _unused_val, test_data = split_data(data=data,
                                                    split_type=args.split_type,
                                                    seed=args.seed,
                                                    sizes=args.split_sizes,
                                                    args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    fingerprint_of = get_features_generator('morgan')
    for split in (train_data, test_data):
        for molecule in tqdm(split, total=len(split)):
            fingerprint = fingerprint_of(mol=molecule.smiles,
                                         radius=args.radius,
                                         num_bits=args.num_bits)
            molecule.set_features(fingerprint)

    debug('Building model')
    # The dataset type is validated before the model type, so an unsupported
    # dataset type is reported even when the model type is unsupported as well.
    if args.dataset_type not in ('regression', 'classification'):
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')
    if args.model_type not in ('random_forest', 'svm'):
        raise ValueError(f'Model type "{args.model_type}" not supported')

    is_regression = args.dataset_type == 'regression'
    if args.model_type == 'random_forest':
        if is_regression:
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        else:
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
    else:
        model = SVR() if is_regression else SVC()

    debug(model)

    debug('Training')
    fit_and_score = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = fit_and_score(model=model,
                           train_data=train_data,
                           test_data=test_data,
                           metric_func=metric_func,
                           args=args,
                           logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores
import csv
import os
import sys

from rdkit import Chem
from rdkit.Chem import Crippen

# Make the repository root importable when this file is run as a script.
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

from chemprop.data.utils import get_data

if __name__ == "__main__":
    # Collect the distinct SMILES strings of several datasets and write the
    # Crippen logP of each one to data/logp.csv.
    datasets = ['lipo', 'delaney', 'freesolv', 'qm7']
    full_molecule_set = set()
    for dataset in datasets:
        data = get_data(path=f'data/{dataset}.csv')
        # A set already ignores duplicates, so no membership test is needed.
        full_molecule_set.update(data.smiles())

    full_molecule_list = list(full_molecule_set)
    logp_list = [
        Crippen.MolLogP(Chem.MolFromSmiles(molecule))
        for molecule in full_molecule_list
    ]

    # newline='' lets the csv module control line endings itself.
    with open('data/logp.csv', 'w', newline='') as logp_csv:
        csv_writer = csv.writer(logp_csv, delimiter=',')
        csv_writer.writerow(['smiles', 'logp'])
        csv_writer.writerows(zip(full_molecule_list, logp_list))
# NOTE(review): the statement below appears to be the tail of a function whose
# definition starts above this view; `similarities` is bound there, and the
# statement's original indentation cannot be recovered from here.
print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)]))


if __name__ == '__main__':
    # Command-line entry point: load two datasets and compare their molecules
    # with the selected similarity measure.
    parser = ArgumentParser()
    parser.add_argument('--data_path_1', type=str, required=True,
                        help='Path to first data CSV file')
    parser.add_argument('--data_path_2', type=str, required=True,
                        help='Path to second data CSV file')
    parser.add_argument('--use_compound_names_1', action='store_true', default=False,
                        help='Whether data_path_1 has compound names in addition to smiles')
    parser.add_argument('--use_compound_names_2', action='store_true', default=False,
                        help='Whether data_path_2 has compound names in addition to smiles')
    parser.add_argument('--similarity_measure', type=str, required=True, choices=['scaffold', 'morgan'],
                        help='Similarity measure to use to compare the two datasets')
    parser.add_argument('--radius', type=int, default=3,
                        help='Radius of Morgan fingerprint')
    parser.add_argument('--sample_rate', type=float, default=1.0,
                        help='Rate at which to sample pairs of molecules for Morgan similarity (to reduce time)')
    args = parser.parse_args()

    data_1 = get_data(path=args.data_path_1, use_compound_names=args.use_compound_names_1)
    data_2 = get_data(path=args.data_path_2, use_compound_names=args.use_compound_names_2)

    if args.similarity_measure == 'scaffold':
        scaffold_similarity(data_1.smiles(), data_2.smiles())
    elif args.similarity_measure == 'morgan':
        morgan_similarity(data_1.smiles(), data_2.smiles(), args.radius, args.sample_rate)
    else:
        # Defensive only: argparse's `choices` already rejects any other value.
        raise ValueError(f'Similarity measure "{args.similarity_measure}" not supported.')
def visualize_encoding_property_space(args: Namespace):
    """
    Saves ternary heatmaps of predictions interpolated between molecule encodings.

    The dataset is first ordered (sorted by one target value, or shuffled). Then,
    ``args.num_examples`` times, three consecutive molecules are picked at a random
    position, their encodings are linearly combined over a triangle, and the
    model's prediction for every combination is drawn as a heatmap that is saved
    as ``<save_dir>/<i>.png``.

    :param args: Namespace providing data_path, similarity_measure, task_index, seed,
                 checkpoint_path, num_examples and save_dir.
    :raises ValueError: If the similarity measure is not supported, or if examples are
                        requested but the dataset holds fewer than three molecules.
    """
    # Load data
    data = get_data(path=args.data_path)

    # Sort according to similarity measure
    if args.similarity_measure == 'property':
        data.sort(key=lambda d: d.targets[args.task_index])
    elif args.similarity_measure == 'random':
        data.shuffle(args.seed)
    else:
        raise ValueError(
            f'similarity_measure "{args.similarity_measure}" not supported or not implemented yet.'
        )

    # Load model and scalers
    model = load_checkpoint(args.checkpoint_path)
    scaler, features_scaler = load_scalers(args.checkpoint_path)
    data.normalize_features(features_scaler)

    # Random seed
    if args.seed is not None:
        random.seed(args.seed)

    # Every example needs a molecule with a neighbour on each side. Without this
    # check random.randint below fails with an opaque "empty range" ValueError.
    if args.num_examples > 0 and len(data) < 3:
        raise ValueError(
            f'At least 3 molecules are required to build a triangle but the dataset has {len(data)}.'
        )

    # Generate visualizations
    for i in trange(args.num_examples):
        # Get random three molecules with similar properties
        index = random.randint(1, len(data) - 2)
        molecules = MoleculeDataset(data[index - 1:index + 2])
        molecule_targets = [t[args.task_index] for t in molecules.targets()]

        # Encode three molecules
        molecule_encodings = model.encoder(molecules.smiles())

        # Define interpolation
        def predict_property(point: List[int]) -> float:
            # Return true value on endpoints of triangle
            argmax = np.argmax(point)
            if point[argmax] == 1:
                return molecule_targets[argmax]

            # Interpolate and predict task value
            encoding = sum(point[j] * molecule_encodings[j]
                           for j in range(len(molecule_encodings)))
            pred = model.ffn(encoding).data.cpu().numpy()
            # NOTE(review): presumably load_scalers yields None when the checkpoint
            # was saved without a target scaler - confirm; in that case the
            # prediction is used as is.
            if scaler is not None:
                pred = scaler.inverse_transform(pred)
            pred = pred.item()

            return pred

        # Create visualization
        scale = 20
        fontsize = 6
        figure, tax = ternary.figure(scale=scale)
        tax.heatmapf(predict_property, boundary=True, style="hexagonal")
        tax.set_title("Property Prediction")
        tax.right_axis_label(
            f'{molecules[0].smiles} ({molecules[0].targets[args.task_index]:.6f}) -->',
            fontsize=fontsize)
        tax.left_axis_label(
            f'{molecules[1].smiles} ({molecules[1].targets[args.task_index]:.6f}) -->',
            fontsize=fontsize)
        tax.bottom_axis_label(
            f'<-- {molecules[2].smiles} ({molecules[2].targets[args.task_index]:.6f})',
            fontsize=fontsize)
        tax.savefig(os.path.join(args.save_dir, f'{i}.png'))
# NOTE(review): the statements up to the `if __name__` guard appear to be the tail
# of a function whose definition starts above this view; `similarities` is bound
# there, and the original indentation cannot be recovered from here.
print()
print(
    f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}'
)
print(f'Minimum dice similarity = {np.min(similarities):.4f}')
print(f'Maximum dice similarity = {np.max(similarities):.4f}')
print()
print('Percentiles for dice similarity')
print(' | '.join([
    f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)
]))


if __name__ == '__main__':
    # Command-line entry point: load two datasets and compare their molecules
    # with the selected similarity measure.
    args = Args().parse_args()

    data_1 = get_data(path=args.data_path_1,
                      smiles_column=args.smiles_column_1)
    data_2 = get_data(path=args.data_path_2,
                      smiles_column=args.smiles_column_2)

    if args.similarity_measure == 'scaffold':
        scaffold_similarity(data_1.smiles(), data_2.smiles())
    elif args.similarity_measure == 'morgan':
        morgan_similarity(data_1.smiles(), data_2.smiles(), args.radius,
                          args.sample_rate)
    else:
        raise ValueError(
            f'Similarity measure "{args.similarity_measure}" not supported.')
def predict_sklearn(args: SklearnPredictArgs):
    """
    Predicts with an ensemble of pickled scikit-learn models and saves the results as CSV.

    When both ``args.parcel_size`` and ``args.max_data_size`` are set, the test data
    is processed in ``ceil(max_data_size / parcel_size)`` parcels. The first parcel
    is written to ``args.preds_path``; parcel ``k > 0`` is written next to it with
    ``.k`` inserted before the file extension. Otherwise everything is handled in
    one pass and written to ``args.preds_path``.

    :param args: Arguments providing the test data, fingerprint settings, checkpoint
                 paths, model/dataset type, number of tasks and output path.
    """
    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
    offset = 0

    for iteration in range(num_iterations):
        if iteration > 0:
            offset = offset + args.parcel_size
            max_data_size = max_data_size + args.parcel_size

        print('Loading data')
        data = get_data(path=args.test_path,
                        smiles_column=args.smiles_column,
                        target_columns=[],
                        max_data_size=max_data_size,
                        data_offset=offset)

        print('Computing morgan fingerprints')
        morgan_fingerprint = get_features_generator('morgan')
        for datapoint in tqdm(data, total=len(data)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        sum_preds = np.zeros((len(data), args.num_tasks))

        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # NOTE(review): pickle.load runs arbitrary code from the file - only
            # use checkpoints from a trusted source.
            with open(checkpoint_path, 'rb') as f:
                model = pickle.load(f)

            model_preds = predict(model=model,
                                  model_type=args.model_type,
                                  dataset_type=args.dataset_type,
                                  features=data.features())
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        print(f'Saving predictions to {args.preds_path}')
        assert len(data) == len(avg_preds)
        makedirs(args.preds_path, isfile=True)

        # Copy predictions over to data
        task_names = get_task_names(path=args.test_path)
        for datapoint, preds in zip(data, avg_preds):
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        # Save
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = f'{name}.{iteration}{ext}'
        else:
            preds_path = args.preds_path

        # Write to the per-parcel path; opening args.preds_path here would make
        # every parcel overwrite the output of the previous one.
        with open(preds_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
            writer.writeheader()

            for datapoint in data:
                writer.writerow(datapoint.row)