# Imports assumed here: standard library, numpy, and chemprop's 0.x module layout.
from argparse import Namespace

import numpy as np

from chemprop.data import MoleculeDataset
from chemprop.data.utils import get_data_from_smiles
from chemprop.train.predict import predict
from chemprop.utils import load_args, load_checkpoint, load_scalers


def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using a saved checkpoint.

    :param checkpoint_path: Path to a trained model checkpoint.
    :param smile: SMILES string to make a prediction on.
    :return: The prediction for the first task, or a list of Nones if the
        SMILES cannot be parsed.
    """
    smiles = [smile]
    args = Namespace()

    # Load the scalers and training arguments saved with the checkpoint
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # Build a dataset from the single SMILES string
    if smile is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        print('Enter a valid SMILES string')
        return

    # Validate SMILES: keep only entries RDKit could parse
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with the single checkpointed model
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    # Return the prediction for the first (and only) molecule's first task
    return sum_preds[0][0]
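
A minimal usage sketch for the function above; the checkpoint path and SMILES string are placeholders, not values from the original source:

# Hypothetical usage: 'model.pt' must be a checkpoint saved by chemprop's training loop.
if __name__ == '__main__':
    value = predict_smile('model.pt', 'CCO')
    print(f'Prediction for first task: {value}')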
Example #2
    def calculate_predictions(self):
        for i, (model, scaler_list) in enumerate(
                tqdm(zip(self.models, self.scalers), total=self.num_models)):
            (
                scaler,
                features_scaler,
                atom_descriptor_scaler,
                bond_feature_scaler,
            ) = scaler_list
            if (features_scaler is not None
                    or atom_descriptor_scaler is not None
                    or bond_feature_scaler is not None):
                self.test_data.reset_features_and_targets()
                if features_scaler is not None:
                    self.test_data.normalize_features(features_scaler)
                if atom_descriptor_scaler is not None:
                    self.test_data.normalize_features(
                        atom_descriptor_scaler, scale_atom_descriptors=True)
                if bond_feature_scaler is not None:
                    self.test_data.normalize_features(bond_feature_scaler,
                                                      scale_bond_features=True)
            preds = predict(
                model=model,
                data_loader=self.test_data_loader,
                scaler=scaler,
                return_unc_parameters=False,
            )
            if self.dataset_type == "spectra":
                preds = normalize_spectra(
                    spectra=preds,
                    phase_features=self.test_data.phase_features(),
                    phase_mask=self.spectra_phase_mask,
                    excluded_sub_value=float("nan"),
                )
            if i == 0:
                sum_preds = np.array(preds)
                sum_squared = np.square(preds)
                if self.individual_ensemble_predictions:
                    individual_preds = np.expand_dims(np.array(preds), axis=-1)
                if model.train_class_sizes is not None:
                    self.train_class_sizes = [model.train_class_sizes]
            else:
                sum_preds += np.array(preds)
                sum_squared += np.square(preds)
                if self.individual_ensemble_predictions:
                    individual_preds = np.append(individual_preds,
                                                 np.expand_dims(preds,
                                                                axis=-1),
                                                 axis=-1)
                if model.train_class_sizes is not None:
                    self.train_class_sizes.append(model.train_class_sizes)

        uncal_preds = sum_preds / self.num_models
        uncal_vars = (sum_squared / self.num_models
                      - np.square(sum_preds) / self.num_models ** 2)
        self.uncal_preds = uncal_preds.tolist()
        self.uncal_vars = uncal_vars.tolist()
        if self.individual_ensemble_predictions:
            self.individual_preds = individual_preds.tolist()
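
The two running sums above implement the population-variance identity Var[p] = E[p^2] - (E[p])^2 across ensemble members. A self-contained numpy check of that equivalence on synthetic predictions (shapes are illustrative, not chemprop output):

import numpy as np

rng = np.random.default_rng(0)
preds = rng.normal(size=(5, 10, 3))  # 5 models, 10 molecules, 3 tasks

sum_preds = preds.sum(axis=0)
sum_squared = np.square(preds).sum(axis=0)
n = preds.shape[0]
uncal_vars = sum_squared / n - np.square(sum_preds) / n ** 2

# Matches numpy's population variance taken over the model axis
assert np.allclose(uncal_vars, preds.var(axis=0))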
Example #3
    def calculate_predictions(self):
        for i, (model, scaler_list) in enumerate(
                tqdm(zip(self.models, self.scalers), total=self.num_models)):
            (
                scaler,
                features_scaler,
                atom_descriptor_scaler,
                bond_feature_scaler,
            ) = scaler_list
            if (features_scaler is not None
                    or atom_descriptor_scaler is not None
                    or bond_feature_scaler is not None):
                self.test_data.reset_features_and_targets()
                if features_scaler is not None:
                    self.test_data.normalize_features(features_scaler)
                if atom_descriptor_scaler is not None:
                    self.test_data.normalize_features(
                        atom_descriptor_scaler, scale_atom_descriptors=True)
                if bond_feature_scaler is not None:
                    self.test_data.normalize_features(bond_feature_scaler,
                                                      scale_bond_features=True)

            preds, lambdas, alphas, betas = predict(
                model=model,
                data_loader=self.test_data_loader,
                scaler=scaler,
                return_unc_parameters=True,
            )
            var = np.array(betas) / (np.array(lambdas) *
                                     (np.array(alphas) - 1))
            if i == 0:
                sum_preds = np.array(preds)
                sum_squared = np.square(preds)
                sum_vars = np.array(var)
                individual_vars = [var]
                if self.individual_ensemble_predictions:
                    individual_preds = np.expand_dims(np.array(preds), axis=-1)
            else:
                sum_preds += np.array(preds)
                sum_squared += np.square(preds)
                sum_vars += np.array(var)
                individual_vars.append(var)
                if self.individual_ensemble_predictions:
                    individual_preds = np.append(individual_preds,
                                                 np.expand_dims(preds,
                                                                axis=-1),
                                                 axis=-1)

        uncal_preds = sum_preds / self.num_models
        uncal_vars = ((sum_vars + sum_squared) / self.num_models
                      - np.square(sum_preds / self.num_models))
        self.uncal_preds = uncal_preds.tolist()
        self.uncal_vars = uncal_vars.tolist()
        self.individual_vars = individual_vars
        if self.individual_ensemble_predictions:
            self.individual_preds = individual_preds.tolist()
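
In the evidential (Normal-Inverse-Gamma) parameterization used here, betas / (lambdas * (alphas - 1)) is each model's expected aleatoric variance, and the final uncal_vars expression adds the epistemic spread of the ensemble means: (sum_vars + sum_squared) / n - (sum_preds / n)^2 equals mean(var) + Var[preds]. A synthetic numpy check of that decomposition (values are illustrative, not chemprop output):

import numpy as np

rng = np.random.default_rng(1)
n = 4                                     # ensemble size
preds = rng.normal(size=(n, 6))           # per-model predicted means
var = rng.uniform(0.1, 0.5, size=(n, 6))  # per-model beta / (lambda * (alpha - 1))

uncal_vars = ((var.sum(axis=0) + np.square(preds).sum(axis=0)) / n
              - np.square(preds.sum(axis=0) / n))

# mean aleatoric variance + epistemic variance of the means
assert np.allclose(uncal_vars, var.mean(axis=0) + preds.var(axis=0))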
Example #4
    def calculate_predictions(self):
        for i, (model, scaler_list) in enumerate(
                tqdm(zip(self.models, self.scalers), total=self.num_models)):
            (
                scaler,
                features_scaler,
                atom_descriptor_scaler,
                bond_feature_scaler,
            ) = scaler_list
            if (features_scaler is not None
                    or atom_descriptor_scaler is not None
                    or bond_feature_scaler is not None):
                self.test_data.reset_features_and_targets()
                if features_scaler is not None:
                    self.test_data.normalize_features(features_scaler)
                if atom_descriptor_scaler is not None:
                    self.test_data.normalize_features(
                        atom_descriptor_scaler, scale_atom_descriptors=True)
                if bond_feature_scaler is not None:
                    self.test_data.normalize_features(bond_feature_scaler,
                                                      scale_bond_features=True)

            preds = predict(
                model=model,
                data_loader=self.test_data_loader,
                scaler=scaler,
                return_unc_parameters=False,
            )
            if i == 0:
                sum_preds = np.array(preds)
                if self.individual_ensemble_predictions:
                    individual_preds = np.expand_dims(np.array(preds), axis=-1)
                if model.train_class_sizes is not None:
                    self.train_class_sizes = [model.train_class_sizes]
            else:
                sum_preds += np.array(preds)
                if self.individual_ensemble_predictions:
                    individual_preds = np.append(individual_preds,
                                                 np.expand_dims(preds,
                                                                axis=-1),
                                                 axis=-1)
                if model.train_class_sizes is not None:
                    self.train_class_sizes.append(model.train_class_sizes)

        self.uncal_preds = (sum_preds / self.num_models).tolist()
        self.uncal_confidence = self.uncal_preds
        if self.individual_ensemble_predictions:
            self.individual_preds = individual_preds.tolist()
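
This variant keeps only the running sum: for classification, the ensemble-averaged probabilities are reported directly as the uncalibrated confidence. A tiny numpy illustration with synthetic probabilities (not chemprop output):

import numpy as np

# 2 models x 1 molecule x 2 classes of predicted probabilities
probs = np.array([[[0.9, 0.1]],
                  [[0.7, 0.3]]])
uncal_preds = probs.mean(axis=0)   # averaged probabilities, also used as the confidence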
Example #5
    def calculate_predictions(self):
        model = next(self.models)
        (
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
        ) = next(self.scalers)
        if (features_scaler is not None or atom_descriptor_scaler is not None
                or bond_feature_scaler is not None):
            self.test_data.reset_features_and_targets()
            if features_scaler is not None:
                self.test_data.normalize_features(features_scaler)
            if atom_descriptor_scaler is not None:
                self.test_data.normalize_features(atom_descriptor_scaler,
                                                  scale_atom_descriptors=True)
            if bond_feature_scaler is not None:
                self.test_data.normalize_features(bond_feature_scaler,
                                                  scale_bond_features=True)
        for i in range(self.dropout_sampling_size):
            preds = predict(
                model=model,
                data_loader=self.test_data_loader,
                scaler=scaler,
                return_unc_parameters=False,
                dropout_prob=self.uncertainty_dropout_p,
            )
            if i == 0:
                sum_preds = np.array(preds)
                sum_squared = np.square(preds)
            else:
                sum_preds += np.array(preds)
                sum_squared += np.square(preds)

        uncal_preds = sum_preds / self.dropout_sampling_size
        uncal_vars = (sum_squared / self.dropout_sampling_size
                      - np.square(sum_preds) / self.dropout_sampling_size ** 2)
        self.uncal_preds = uncal_preds.tolist()
        self.uncal_vars = uncal_vars.tolist()
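
The loop above performs Monte Carlo dropout: a single model is sampled dropout_sampling_size times with dropout kept active, and the sample mean and variance become the uncalibrated prediction and uncertainty. A standalone PyTorch sketch of the same estimator on a toy network (the architecture and sizes are assumptions, not chemprop's model):

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(),
                    nn.Dropout(p=0.2), nn.Linear(16, 1))
x = torch.randn(32, 8)

net.train()  # leave dropout active at inference time
with torch.no_grad():
    samples = torch.stack([net(x) for _ in range(20)])  # (20, 32, 1)

mc_mean = samples.mean(dim=0)                  # uncalibrated prediction
mc_var = samples.var(dim=0, unbiased=False)    # uncalibrated variance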
Example #6
    def __call__(
            self,
            smiles: List[Optional[str]] = None,
            smiles2: List[Optional[str]] = None
    ) -> List[Optional[List[float]]]:
        if self.computed_prop:
            # Identify non-None smiles
            valid = [(i, smile) for i, smile in enumerate(smiles)
                     if smile is not None]
            if len(valid) > 0:
                valid_indices, valid_smiles = zip(*valid)
            else:
                valid_indices, valid_smiles = [], []

            valid_props = [
                self.scorer(valid_smile) for valid_smile in valid_smiles
            ]

            # Combine properties of non-None smiles with Nones
            props = [None] * len(smiles)
            for i, prop in zip(valid_indices, valid_props):
                props[i] = prop

            return props

        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])
        if self.features_generator is not None:
            self.generate_features(test_data)
            valid_indices = [
                i for i in range(len(test_data))
                if test_data[i].mol is not None
                and test_data[i].features is not None
            ]
            test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        # Normalize features
        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        # Predict with each model individually and sum predictions
        sum_preds = np.zeros((len(test_data), self.num_tasks))
        for chemprop_model in self.chemprop_models:
            model_preds = predict(model=chemprop_model,
                                  data=test_data,
                                  batch_size=self.batch_size,
                                  scaler=self.scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.chemprop_models)
        avg_preds = avg_preds.tolist()

        # Put Nones for invalid smiles
        full_preds = [None] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        if self.neg_threshold:
            return [
                -p[self.prop_index] if p is not None else None
                for p in full_preds
            ]
        else:
            return [
                p[self.prop_index] if p is not None else None
                for p in full_preds
            ]
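
A hypothetical call on an already-constructed instance of the class that defines __call__ above; invalid or None SMILES come back as None in the output:

# 'scorer' is assumed to be a fitted instance; the returned values are illustrative.
scores = scorer(['CCO', 'not-a-smiles', None])
# -> [0.42, None, None]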
Example #7
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on the data at args.test_path.

    :param args: Arguments.
    :param smiles: SMILES strings to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=args.batch_size,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
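
A hedged invocation sketch; every attribute set below is read somewhere in make_predictions, but the concrete values and paths are placeholders, not values from the original source:

args = Namespace(checkpoint_paths=['model_0/model.pt'],
                 preds_path='preds.csv',
                 test_path=None,
                 gpu=None,
                 use_compound_names=False,
                 batch_size=50)
preds = make_predictions(args, smiles=['CCO', 'c1ccccc1'])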
Example #8
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    :param args: Arguments.
    :param logger: Logger.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.2, 0.0),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        with open(args.data_path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset.smiles():
                    writer.writerow([smiles])
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset.smiles():
                    writer.writerow(lines_by_smiles[smiles])
            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
            split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        # torch.utils.tensorboard uses log_dir; older tensorboardX versions use logdir
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            writer = SummaryWriter(logdir=save_dir)
        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  batch_size=args.batch_size,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if (args.minimize_score and avg_val_score < best_score) or \
                    (not args.minimize_score and avg_val_score > best_score):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_preds = predict(model=model,
                             data=test_data,
                             batch_size=args.batch_size,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores