コード例 #1
0
def run_meta_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Builds meta-learning task splits and trains an ensemble of models.

    Loads the dataset at ``args.data_path``, partitions its tasks into
    meta-train / meta-validation / meta-test sets using ChEMBL assay-type
    metadata, builds one :class:`MetaTaskDataLoader` per partition, and then
    runs the ensemble train / validate / test loop.

    :param args: Arguments. ``args.task_names``, ``args.num_tasks`` and
                 ``args.features_size`` are overwritten by this function.
    :param logger: Logger. When ``None``, messages are written with ``print``.
    :return: A list of ensemble scores for each task.
    :raises ValueError: If ``args.chembl_assay_metadata_pickle_path`` is ``None``.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    # debug(f'Splitting data with seed {args.seed}')
    # if args.separate_test_path:
    #     test_data = get_data(path=args.separate_test_path, args=args, features_path=args.separate_test_features_path, logger=logger)
    # if args.separate_val_path:
    #     val_data = get_data(path=args.separate_val_path, args=args, features_path=args.separate_val_features_path, logger=logger)

    # if args.separate_val_path and args.separate_test_path:
    #     train_data = data
    # elif args.separate_val_path:
    #     train_data, _, test_data = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.0, 0.2), seed=args.seed, args=args, logger=logger)
    # elif args.separate_test_path:
    #     train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
    # else:
    #     train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    # if args.save_smiles_splits:
    #     save_smiles_splits(
    #         train_data=train_data,
    #         val_data=val_data,
    #         test_data=test_data,
    #         data_path=args.data_path,
    #         save_dir=args.save_dir
    #     )

    # If this happens, then need to move this logic into the task data loader
    # when it creates the datasets!
    # if args.features_scaling:
    #     features_scaler = train_data.normalize_features(replace_nan_token=0)
    #     val_data.normalize_features(features_scaler)
    #     test_data.normalize_features(features_scaler)
    # else:
    #     features_scaler = None

    # args.train_data_size = len(train_data)

    # debug(f'Total size = {len(data):,} | '
    #       f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    # if args.dataset_type == 'regression':
    #     debug('Fitting scaler')
    #     train_smiles, train_targets = train_data.smiles(), train_data.targets()
    #     scaler = StandardScaler().fit(train_targets)
    #     scaled_targets = scaler.transform(train_targets).tolist()
    #     train_data.set_targets(scaled_targets)
    # else:
    #     scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    # test_smiles, test_targets = test_data.smiles(), test_data.targets()
    # if args.dataset_type == 'multiclass':
    #     sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    # else:
    #     sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    # NOTE(review): `cache` and `num_workers` are computed but not consumed
    # anywhere below; presumably intended for the data loaders.
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Set up MetaTaskDataLoaders, which takes care of task splits under the hood
    # Set up task splits into T_tr, T_val, T_test

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if args.chembl_assay_metadata_pickle_path is None:
        raise ValueError('args.chembl_assay_metadata_pickle_path must be set for meta training')

    # The configured path is used as a raw string prefix (plain concatenation),
    # so it has to end with a path separator.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at trusted metadata files.
    with open(args.chembl_assay_metadata_pickle_path +
            'chembl_128_assay_type_to_names.pickle', 'rb') as handle:
        chembl_128_assay_type_to_names = pickle.load(handle)
    # Loaded but not used below; kept so a missing file still fails early.
    with open(args.chembl_assay_metadata_pickle_path +
            'chembl_128_assay_name_to_type.pickle', 'rb') as handle:
        chembl_128_assay_name_to_type = pickle.load(handle)

    # Copy GSK implementation of task split
    # We have 5 Task types remaining
    #   ADME (A)
    #   Toxicity (T)
    #   Unassigned (U)
    #   Binding (B)
    #   Functional (F)
    # resulting in 902 tasks.
    #
    # For T_val, randomly select 10 B and F tasks
    # For T_test, select another 10 B and F tasks and allocate all A, T, and U
    # tasks to the test split.
    # For T_train, allocate the remaining B and F tasks.
    num_val_bf_tasks = args.meta_split_sizes_BF[0]
    num_test_bf_tasks = args.meta_split_sizes_BF[1]
    val_end = num_val_bf_tasks
    test_end = num_val_bf_tasks + num_test_bf_tasks

    chembl_id_to_idx = {chembl_id: idx for idx, chembl_id in enumerate(args.task_names)}

    def _indices_for(assay_names):
        """Map assay names to their column index in ``args.task_names``."""
        return [chembl_id_to_idx[assay] for assay in assay_names]

    def _shuffled_indices(assay_type):
        """Return the task indices of one assay type in random order."""
        shuffled_names = np.copy(chembl_128_assay_type_to_names[assay_type])
        # NOTE(review): this uses NumPy's global RNG, which is not seeded in
        # this function (only torch is seeded above), so the task split is
        # not reproducible unless the caller seeds NumPy.
        np.random.shuffle(shuffled_names)
        return _indices_for(shuffled_names)

    def _task_mask(*index_lists):
        """Build a 0/1 vector over all tasks with the given indices set to 1."""
        mask = [0] * len(args.task_names)
        for index_list in index_lists:
            for idx in index_list:
                mask[idx] = 1
        return mask

    # Shuffle B and F tasks (B first, then F, to keep the RNG call order)
    randomized_B_task_indices = _shuffled_indices('B')
    randomized_F_task_indices = _shuffled_indices('F')

    # T_val: the first `val_end` shuffled B and F tasks
    T_val = _task_mask(
        randomized_B_task_indices[:val_end],
        randomized_F_task_indices[:val_end])

    # T_test: the next slice of B and F tasks plus every A, T and U task
    T_test = _task_mask(
        randomized_B_task_indices[val_end:test_end],
        randomized_F_task_indices[val_end:test_end],
        _indices_for(chembl_128_assay_type_to_names['A']),
        _indices_for(chembl_128_assay_type_to_names['T']),
        _indices_for(chembl_128_assay_type_to_names['U']))

    # T_tr: all remaining B and F tasks
    T_tr = _task_mask(
        randomized_B_task_indices[test_end:],
        randomized_F_task_indices[test_end:])

    # Random task split for testing (disabled):
    # task_indices = list(range(len(args.task_names)))
    # np.random.shuffle(task_indices)
    # train_task_split, val_task_split, test_task_split = 0.9, 0, 0.1
    # train_task_cutoff = int(len(task_indices) * train_task_split)
    # train_task_idxs, test_task_idxs = [0] * len(task_indices), [0] * len(task_indices)
    # for idx in task_indices[:train_task_cutoff]:
    #     train_task_idxs[idx] = 1
    # for idx in task_indices[train_task_cutoff:]:
    #     test_task_idxs[idx] = 1

    train_meta_task_data_loader = MetaTaskDataLoader(
            dataset=data,
            tasks=T_tr,
            sizes=args.meta_train_split_sizes,
            args=args,
            logger=logger)

    # NOTE(review): the validation loader uses `meta_test_split_sizes`, the
    # same sizes as the test loader; confirm this is intended.
    val_meta_task_data_loader = MetaTaskDataLoader(
            dataset=data,
            tasks=T_val,
            sizes=args.meta_test_split_sizes,
            args=args,
            logger=logger)

    test_meta_task_data_loader = MetaTaskDataLoader(
            dataset=data,
            tasks=T_test,
            sizes=args.meta_test_split_sizes,
            args=args,
            logger=logger)

    # NOTE(review): this loop only walks the meta-train loader and prints a
    # marker per task; it looks like leftover scaffolding for the real
    # meta-training loop.
    for meta_train_batch in train_meta_task_data_loader.tasks():
        for train_task in meta_train_batch:
            print('In inner loop')

    # NOTE(review): the ensemble loop below references `scaler`,
    # `features_scaler`, `train_data_loader`, `val_data_loader`,
    # `test_data_loader`, `test_targets` and `sum_test_preds`, none of which
    # is assigned in this function (several appear only in the commented-out
    # code above). They must be wired to the meta-task loaders before this
    # section can run.

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # The keyword was rejected; retry with the alternative spelling.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                num_tasks=args.num_tasks,
                metric_func=metric_func,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger
            )

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), device=args.device, logger=logger)

        test_preds = predict(
            model=model,
            data_loader=test_data_loader,
            scaler=scaler
        )
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            logger=logger
        )

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger
    )

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
コード例 #2
0
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains an ensemble of models and returns the ensemble's test scores.

    For each ensemble member, the checkpoint with the best validation score
    (lowest if ``args.minimize_score``, highest otherwise) is reloaded and
    evaluated on the test set; the members' test predictions are averaged to
    produce the ensemble scores.

    :param args: Arguments. ``args.task_names``, ``args.num_tasks``,
                 ``args.features_size`` and ``args.train_data_size`` are
                 overwritten by this function.
    :param logger: Logger. When ``None``, messages are written with ``print``.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print command line
    debug('Command line')
    debug(args.command_line)

    # Print args
    debug('Args')
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    # Only the splits that were not supplied as separate files are carved
    # out of `data`.
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        # newline='' is what the csv module expects for files it reads/writes.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            # Index every data row by its SMILES (first column). If a SMILES
            # occurs more than once, the last occurrence wins.
            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w', newline='') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(['smiles'])
                for smiles in dataset.smiles():
                    csv_writer.writerow([smiles])
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w', newline='') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(header)
                for smiles in dataset.smiles():
                    csv_writer.writerow(lines_by_smiles[smiles])
            # Sort once after collecting, rather than after every append.
            split_indices = sorted(
                indices_by_smiles[smiles] for smiles in dataset.smiles())
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        # Fit the feature scaler on the training split and reuse it for the
        # validation and test splits.
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles = test_data.smiles()
    test_targets = test_data.targets()
    test_weights = test_data.weights()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Train ensemble of models
    # `writer` always holds the most recent SummaryWriter; the previous one is
    # closed before a new one is opened, and the last one stays open so the
    # ensemble score can be logged to it after the loop.
    writer = None
    for model_idx in range(args.ensemble_size):
        if writer is not None:
            writer.close()

        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # The keyword was rejected; retry with the alternative spelling.
            writer = SummaryWriter(logdir=save_dir)
        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # Only ExponentialLR is stepped here, once per epoch.
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data=val_data,
                num_tasks=args.num_tasks,
                metric_func=metric_func,
                batch_size=args.batch_size,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_preds = predict(model=model,
                             data=test_data,
                             batch_size=args.batch_size,
                             scaler=scaler)
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            weights=test_weights,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            logger=logger)

        # Accumulate predictions for the ensemble average.
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        weights=test_weights,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    # The ensemble score is logged to the last model's writer, which is then
    # closed so its events are flushed. No writer exists when
    # args.ensemble_size is 0.
    if writer is not None:
        writer.add_scalar(f'ensemble_test_{args.metric}',
                          avg_ensemble_test_score, 0)
        writer.close()

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
コード例 #3
0
ファイル: run_training.py プロジェクト: txie-93/chemprop
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Splits the data, trains an ensemble of Chemprop models, and returns the ensemble's test scores.

    For every ensemble member, the checkpoint with the best average validation
    score on :code:`args.metric` is reloaded and evaluated on the test set. The
    test predictions of all members are then averaged to score the ensemble.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output. When ``None``, messages are printed instead.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.

    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    # Separately provided validation/test sets are loaded from their own files
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    # Whatever was not provided separately is carved out of `data`
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              num_folds=args.num_folds,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             num_folds=args.num_folds,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            num_folds=args.num_folds,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        assert len(args.smiles_columns) == 1
        save_smiles_splits(data_path=args.data_path,
                           save_dir=args.save_dir,
                           train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           smiles_column=args.smiles_columns[0])

    if args.features_scaling:
        # The scaler is fit on the training set and reused for validation/test
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = train_data.normalize_targets()
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation: accumulator for the per-model test predictions
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(
            f'With class_balance, effective train size = {train_data_loader.iter_size:,}'
        )

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        # The keyword is `log_dir` or `logdir` depending on the SummaryWriter
        # implementation; an unknown keyword raises TypeError. Only that error
        # triggers the fallback so unrelated failures are not masked.
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger,
                                    cur_args=args)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # ExponentialLR is stepped once per epoch here
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metrics=args.metrics,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f'Validation {metric} = {avg_val_score:.6f}')
                writer.add_scalar(f'validation_{metric}', avg_val_score,
                                  n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(
                            f'Validation {task_name} {metric} = {val_score:.6f}'
                        )
                        writer.add_scalar(f'validation_{task_name}_{metric}',
                                          val_score, n_iter)

            # Save model checkpoint if improved validation score
            # (model selection uses args.metric only, not every logged metric)
            avg_val_score = np.nanmean(val_scores[args.metric])
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Accumulate this model's predictions for the ensemble average
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}')
            writer.add_scalar(f'test_{metric}', avg_test_score, 0)

            if args.show_individual_scores:
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(
                        f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}'
                    )
                    writer.add_scalar(f'test_{task_name}_{metric}', test_score,
                                      n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(
                    f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}'
                )

    # Optionally save test preds (one column per task, averaged over the ensemble)
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(
            data={'smiles': test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [
                pred[i] for pred in avg_test_preds
            ]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir,
                                                 'test_preds.csv'),
                                    index=False)

    return ensemble_scores
コード例 #4
0
def run_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Loads data, trains an ensemble of models, and returns the ensemble test scores.

    For each ensemble member, the checkpoint with the best average validation
    score is reloaded and evaluated on the test set. The test predictions of
    all members are then averaged to score the ensemble.

    :param args: Arguments.
    :param logger: Logger. When ``None``, messages are printed instead.
    :return: A tuple ``(ensemble_scores, uncertainty_estimator)``: the list of
             ensemble scores for each task, and the uncertainty estimator built
             for ``args.uncertainty`` (``None`` unless it is ``'random_forest'``
             or ``'gaussian'``). NOTE: the ``List[float]`` return annotation
             only describes the first element of this tuple.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    # Separately provided validation/test sets are loaded from their own files
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    # Whatever was not provided separately is carved out of `data`
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir)

    if args.features_scaling:
        # The scaler is fit on the training set and reused for validation/test
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)
    args.val_data_size = len(val_data)
    args.test_data_size = len(test_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation: accumulator for the per-model test predictions
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Only using UQ methods if we have to train an estimator
    if args.uncertainty in ('random_forest', 'gaussian'):
        uncertainty_estimator = uncertainty_estimator_builder(
            args.uncertainty)(args, train_data, scaler)
    else:
        uncertainty_estimator = None

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        # The keyword is `log_dir` or `logdir` depending on the SummaryWriter
        # implementation; an unknown keyword raises TypeError. Only that error
        # triggers the fallback so unrelated failures are not masked.
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # ExponentialLR is stepped once per epoch here
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  args=args,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)

        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Accumulate this model's predictions for the ensemble average
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        # Hand the best checkpoint of this ensemble member to the estimator
        if uncertainty_estimator is not None:
            uncertainty_estimator.process_model(model)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    # The ensemble scalar goes to the writer of the last ensemble member
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores, uncertainty_estimator
def train_gp(
        model,
        train_data,
        val_data,
        num_workers,
        cache,
        metric_func,
        scaler,
        features_scaler,
        args,
        save_dir):
    """
    Trains a DKL model (``model`` as feature extractor with a GP layer on top).

    The checkpoint and likelihood with the best average validation score are
    kept; the best checkpoint is reloaded into a freshly instantiated template
    before returning.

    :param model: Model used as the feature extractor. Its ``featurizer``
                  attribute is set to ``True`` in place.
    :param train_data: Dataset used for training and for the inducing points.
    :param val_data: Dataset used for validation after every epoch.
    :param num_workers: Number of workers passed to the data loaders.
    :param cache: Cache flag passed to the data loaders.
    :param metric_func: Metric function used to score validation predictions.
    :param scaler: Target scaler, stored in the checkpoint and passed to ``evaluate``.
    :param features_scaler: Features scaler, stored in the checkpoint.
    :param args: Arguments; the ``*_gp`` entries configure this training stage.
    :param save_dir: Directory in which ``DKN_model.pt`` is written.
    :return: A tuple ``(model, best_likelihood)`` holding the reloaded best model
             and a copy of the likelihood taken when that model was saved.
    """
    # create data loaders for gp (allows different batch size)
    train_data_loader = MoleculeDataLoader(
        dataset=train_data,
        batch_size=args.batch_size_gp,
        num_workers=num_workers,
        cache=cache,
        class_balance=args.class_balance,
        shuffle=True,
        seed=args.seed
    )
    val_data_loader = MoleculeDataLoader(
        dataset=val_data,
        batch_size=args.batch_size_gp,
        num_workers=num_workers,
        cache=cache
    )

    # feature_extractor
    model.featurizer = True
    feature_extractor = model

    # inducing points
    inducing_points = initial_inducing_points(
        train_data_loader,
        feature_extractor,
        args
        )

    # GP layer
    gp_layer = GPLayer(inducing_points, args.num_tasks)

    # full DKL model
    model = copy.deepcopy(DKLMoleculeModel(feature_extractor, gp_layer))

    # likelihood
    # rank 0 restricts to diagonal matrix
    # The task count must match the GP layer, so it comes from args rather
    # than being fixed to a single dataset's number of targets.
    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=args.num_tasks, rank=0)

    # model and likelihood to CUDA
    if args.cuda:
        model.cuda()
        likelihood.cuda()

    # loss object
    mll = gpytorch.mlls.VariationalELBO(likelihood, model.gp_layer, num_data=args.train_data_size)

    # optimizer (weight decay is applied to the feature extractor only)
    params_list = [
        {'params': model.feature_extractor.parameters(), 'weight_decay': args.weight_decay_gp},
        {'params': model.gp_layer.hyperparameters()},
        {'params': model.gp_layer.variational_parameters()},
        {'params': likelihood.parameters()},
    ]
    optimizer = torch.optim.Adam(params_list, lr=args.init_lr_gp)

    # scheduler (one entry per parameter group)
    num_params = len(params_list)
    scheduler = NoamLR(
        optimizer=optimizer,
        warmup_epochs=[args.warmup_epochs_gp]*num_params,
        total_epochs=[args.noam_epochs_gp]*num_params,
        steps_per_epoch=args.train_data_size // args.batch_size_gp,
        init_lr=[args.init_lr_gp]*num_params,
        max_lr=[args.max_lr_gp]*num_params,
        final_lr=[args.final_lr_gp]*num_params)

    checkpoint_path = os.path.join(save_dir, 'DKN_model.pt')

    # Ensure a checkpoint and a likelihood exist even if no epoch improves the
    # validation score (0 epochs, or only NaN scores); otherwise the reload
    # below would read a missing/stale file and `best_likelihood` would be unbound.
    save_checkpoint(checkpoint_path, model, scaler, features_scaler, args)
    best_likelihood = copy.deepcopy(likelihood)

    print("----------GP training----------")

    # training loop
    best_score = float('inf') if args.minimize_score else -float('inf')
    best_epoch, n_iter = 0, 0
    for epoch in range(args.epochs_gp):
        print(f'GP epoch {epoch}')

        # Once the Noam schedule is over, switch to a constant schedule
        # built from the final learning rate
        if epoch == args.noam_epochs_gp:
            scheduler = scheduler_const([args.final_lr_gp])

        n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=mll,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                gp_switch=True,
                likelihood=likelihood
            )

        val_scores = evaluate(
            model=model,
            data_loader=val_data_loader,
            args=args,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            scaler=scaler
        )

        # Average validation score
        avg_val_score = np.nanmean(val_scores)
        print(f'Validation {args.metric} = {avg_val_score:.6f}')
        # NOTE(review): the key says MAE regardless of args.metric; kept as is
        # because existing dashboards may depend on it.
        wandb.log({"Validation MAE": avg_val_score})

        # Save model AND LIKELIHOOD checkpoint if improved validation score
        if args.minimize_score and avg_val_score < best_score or \
                not args.minimize_score and avg_val_score > best_score:
            best_score, best_epoch = avg_val_score, epoch
            save_checkpoint(checkpoint_path, model, scaler, features_scaler, args)
            best_likelihood = copy.deepcopy(likelihood)

    # load model with best validation score
    # NOTE: TEMPLATE MUST BE NEWLY INSTANTIATED MODEL
    print(f'Loading model with best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
    model = load_checkpoint(checkpoint_path, device=args.device, logger=None,
                            template=DKLMoleculeModel(MoleculeModel(args, featurizer=True), gp_layer))

    return model, best_likelihood
コード例 #6
0
def train_swag_pdts(model_core, train_data_loader, loss_func, scaler,
                    features_scaler, args, save_dir, batch_no):
    """
    Runs SWAG training on ``model_core`` and returns the wrapping SWAG model.

    The core model is trained with SGD for ``args.epochs_swag`` epochs. After
    the burn-in period, the current weights are collected into the SWAG model
    whenever the epoch's average loss is below ``args.loss_threshold``. The
    resulting SWAG model is checkpointed as ``model_{batch_no}.pt``.

    :param model_core: Model to train; must expose ``encoder``, ``ffn`` and ``log_noise``.
    :param train_data_loader: Data loader providing the training batches.
    :param loss_func: Loss function passed to ``train``.
    :param scaler: Target scaler stored in the checkpoint.
    :param features_scaler: Features scaler stored in the checkpoint.
    :param args: Arguments; the ``*_swag`` entries configure this training stage.
    :param save_dir: Directory in which the checkpoint is written.
    :param batch_no: Batch index; 0 selects the one-cycle schedule, any other
                     value the constant schedule. Also used in the checkpoint name.
    :return: The SWAG model wrapping ``model_core``.
    """
    # SWAG expects the negation of the `cov_mat` switch
    no_cov_mat = not args.cov_mat

    # SWAG wrapper around the core model
    swag_model = SWAG(model_core,
                      args,
                      no_cov_mat,
                      args.max_num_models,
                      var_clamp=1e-30)

    # ---- optimiser: encoder and FFN share the defaults, log noise has its own
    # (smaller) learning rate and no weight decay ----
    param_groups = [
        {'params': model_core.encoder.parameters()},
        {'params': model_core.ffn.parameters()},
        {
            'params': model_core.log_noise,
            'lr': args.lr_swag / 5 / 25,
            'weight_decay': 0
        },
    ]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.lr_swag / 25,
                                weight_decay=args.weight_decay_swag,
                                momentum=args.momentum_swag)

    # ---- scheduler: one-cycle cosine schedule for the first batch only ----
    num_param_groups = len(optimizer.param_groups)
    if batch_no != 0:
        scheduler = scheduler_const([args.lr_swag])
    else:
        scheduler = OneCycleLR(
            optimizer,
            max_lr=[args.lr_swag, args.lr_swag, args.lr_swag / 5],
            epochs=args.epochs_swag,
            # ceiling division: number of batches per epoch
            steps_per_epoch=-(-args.train_data_size // args.batch_size),
            pct_start=5 / args.epochs_swag,
            anneal_strategy='cos',
            cycle_momentum=False,
            div_factor=25.0,
            final_div_factor=1 / 25)

    # freeze log noise
    for param_name, param in model_core.named_parameters():
        if param_name != 'log_noise':
            continue
        param.requires_grad = False

    print("----------SWAG training----------")

    # training loop
    n_iter = 0
    for epoch in range(args.epochs_swag):

        print(f'SWAG epoch {epoch}')

        loss_avg, n_iter = train(model=model_core,
                                 data_loader=train_data_loader,
                                 loss_func=loss_func,
                                 optimizer=optimizer,
                                 scheduler=scheduler,
                                 args=args,
                                 n_iter=n_iter)

        # SWAG update: only past burn-in and when the loss is low enough
        past_burnin = epoch >= args.burnin_swag
        if past_burnin and (loss_avg < args.loss_threshold):
            swag_model.collect_model(model_core)
            print('***collection***')

    # save final swag model
    checkpoint_path = os.path.join(save_dir, f'model_{batch_no}.pt')
    save_checkpoint(checkpoint_path, swag_model, scaler, features_scaler,
                    args)

    return swag_model
コード例 #7
0
ファイル: run_training.py プロジェクト: zsheldon-dev/chemprop
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains an ensemble of models and returns the ensemble's test scores.

    For each of the ``args.ensemble_size`` models, the checkpoint with the best
    average validation score is reloaded and evaluated on the test set. The test
    predictions of all models are then averaged and scored.

    Side effects visible in this function:

    * attributes are set on ``args`` (``task_names``, ``num_tasks``,
      ``features_size``, ``real_num_tasks``, ``train_data_size`` and, depending
      on the options, ``bin_predictions``, ``class_weights``, ``pnorm_target``);
    * a checkpoint ``model.pt`` and a ``SummaryWriter`` log directory are created
      under ``args.save_dir/model_<idx>`` for every ensemble member;
    * ``<split>_smiles.csv``, ``<split>_full.csv`` and ``split_indices.pckl`` are
      written to ``args.save_dir`` when ``args.save_smiles_splits`` is set;
    * pickled training chunks are written to ``args.chunk_temp_dir`` when
      ``args.num_chunks > 1``.

    :param args: Arguments.
    :param logger: Logger. When None, messages are printed instead.
    :return: A list of ensemble scores for each task. ``[0]`` is returned right
             after the first model when ``args.dataset_type`` is ``'unsupervised'``.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug(f'Number of tasks = {args.num_tasks}')

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':  # Note: for now, binning based on whole dataset, not just training set
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug(f'Splitting data with seed {args.seed}')
        # Train split comes from the binned data, val/test splits from the
        # regression data; both calls use the same split type, sizes and seed.
        train_data, _, _ = split_data(data=data,
                                      split_type=args.split_type,
                                      sizes=args.split_sizes,
                                      seed=args.seed,
                                      args=args,
                                      logger=logger)
        _, val_data, test_data = split_data(regression_data,
                                            split_type=args.split_type,
                                            sizes=args.split_sizes,
                                            seed=args.seed,
                                            args=args,
                                            logger=logger)
    else:
        debug(f'Splitting data with seed {args.seed}')
        if args.separate_test_set:
            test_data = get_data(path=args.separate_test_set,
                                 args=args,
                                 features_path=args.separate_test_set_features,
                                 logger=logger)
            if args.separate_val_set:
                val_data = get_data(
                    path=args.separate_val_set,
                    args=args,
                    features_path=args.separate_val_set_features,
                    logger=logger)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(
                    data=data,
                    split_type=args.split_type,
                    sizes=(0.8, 0.2, 0.0),
                    seed=args.seed,
                    args=args,
                    logger=logger)
        else:
            # NOTE(review): args.separate_val_set is only consulted when a
            # separate test set is also given - confirm this is intended.
            train_data, val_data, test_data = split_data(
                data=data,
                split_type=args.split_type,
                sizes=args.split_sizes,
                seed=args.seed,
                args=args,
                logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

        if args.class_balance:
            # Class weights are the reciprocal of (train class size * batch size)
            train_class_sizes = get_class_sizes(train_data)
            class_batch_counts = torch.Tensor(
                train_class_sizes) * args.batch_size
            args.class_weights = 1 / torch.Tensor(class_batch_counts)

    if args.save_smiles_splits:
        # The csv module expects its file objects to be opened with newline=''
        # so that it controls line endings itself.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            # Map each smiles string (first column) to its row and row index;
            # a later duplicate smiles overwrites an earlier one.
            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            dataset_smiles = dataset.smiles()
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset_smiles:
                    writer.writerow([smiles])
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset_smiles:
                    writer.writerow(lines_by_smiles[smiles])
            # Sort once per split instead of after every appended index
            split_indices = sorted(indices_by_smiles[smiles]
                                   for smiles in dataset_smiles)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        # Fit the features scaler on the training set and reuse it for val/test
        features_scaler = train_data.normalize_features(
            replace_nan_token=None if args.predict_features else 0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(
        train_data
    ) if args.prespecified_chunk_dir is None else args.prespecified_chunks_max_examples_per_epoch

    if args.adversarial or args.moe:
        val_smiles, test_smiles = val_data.smiles(), test_data.smiles()

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Optionally truncate outlier values
    if args.truncate_outliers:
        print('Truncating outliers in train set')
        train_data = truncate_outliers(train_data)

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression' and args.target_scaling:
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    if args.moe:
        train_data = cluster_split(train_data,
                                   args.num_sources,
                                   args.cluster_max_ratio,
                                   seed=args.cluster_split_seed,
                                   logger=logger)

    # Chunk training data if too large to load in memory all at once
    if args.num_chunks > 1:
        os.makedirs(args.chunk_temp_dir, exist_ok=True)
        train_paths = []
        if args.moe:
            # With moe, train_data is iterated as a collection of sources;
            # chunk i holds the i-th chunk of every source.
            chunked_sources = [td.chunk(args.num_chunks) for td in train_data]
            chunks = []
            for i in range(args.num_chunks):
                chunks.append([source[i] for source in chunked_sources])
        else:
            chunks = train_data.chunk(args.num_chunks)
        for i in range(args.num_chunks):
            chunk_path = os.path.join(args.chunk_temp_dir, str(i) + '.txt')
            memo_path = os.path.join(args.chunk_temp_dir,
                                     'memo' + str(i) + '.txt')
            with open(chunk_path, 'wb') as f:
                pickle.dump(chunks[i], f)
            train_paths.append((chunk_path, memo_path))
        # From here on train_data is a list of (chunk_path, memo_path) tuples
        train_data = train_paths

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric, args=args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.maml:  # TODO refactor
        test_targets = []
        for _ in range(len(data.data[0].targets)):
            _, task_test_data, _ = test_data.sample_maml_task(args, seed=0)
            test_targets += task_test_data.targets()

    # Accumulator for the test predictions of all ensemble members
    if args.dataset_type == 'bert_pretraining':
        sum_test_preds = {
            'features':
            np.zeros((len(test_smiles), args.features_size))
            if args.features_size is not None else None,
            'vocab':
            np.zeros((len(test_targets['vocab']), args.vocab.output_size))
        }
    elif args.dataset_type == 'kernel':
        sum_test_preds = np.zeros((len(test_targets), args.num_tasks))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    if args.maml:
        sum_test_preds = None  # annoying to determine exact size; will initialize later

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        test_targets['vocab'] = [
            target if mask == 0 else None
            for target, mask in zip(test_targets['vocab'], test_data.mask())
        ]

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        if args.adjust_weight_decay:
            args.pnorm_target = compute_pnorm(model)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            if args.prespecified_chunk_dir is not None:
                # load some different random chunks each epoch
                train_data, val_data = load_prespecified_chunks(args, logger)
                debug('Loaded prespecified chunks for epoch')

            if args.dataset_type == 'unsupervised':  # won't work with moe
                full_data = MoleculeDataset(train_data.data + val_data.data)
                generate_unsupervised_cluster_labels(
                    build_model(args), full_data,
                    args)  # cluster with a new random init
                model.create_ffn(
                    args
                )  # reset the ffn since we're changing targets-- we're just pretraining the encoder.
                optimizer.param_groups.pop()  # remove ffn parameters
                optimizer.add_param_group({
                    'params': model.ffn.parameters(),
                    'lr': args.init_lr[1],
                    'weight_decay': args.weight_decay[1]
                })
                if args.cuda:
                    model.ffn.cuda()

            if args.gradual_unfreezing:
                if epoch % args.epochs_per_unfreeze == 0:
                    unfroze_layer = model.unfreeze_next(
                    )  # consider just stopping early after we have nothing left to unfreeze?
                    if unfroze_layer:
                        debug('Unfroze last frozen layer')

            # val_smiles only exists when adversarial or moe is set; the
            # conditional expression below never reads it otherwise.
            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer,
                           chunk_names=(args.num_chunks > 1),
                           val_smiles=val_smiles if args.adversarial else None,
                           test_smiles=test_smiles
                           if args.adversarial or args.moe else None)
            # Only ExponentialLR is stepped here, once per epoch
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  metric_func=metric_func,
                                  args=args,
                                  scaler=scaler,
                                  logger=logger)

            if args.dataset_type == 'bert_pretraining':
                if val_scores['features'] is not None:
                    debug(
                        f'Validation features rmse = {val_scores["features"]:.6f}'
                    )
                    writer.add_scalar('validation_features_rmse',
                                      val_scores['features'], n_iter)
                # Model selection below uses the vocab score only
                val_scores = [val_scores['vocab']]

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    if task_name in desired_labels:
                        debug(
                            f'Validation {task_name} {args.metric} = {val_score:.6f}'
                        )
                        writer.add_scalar(
                            f'validation_{task_name}_{args.metric}', val_score,
                            n_iter)

            # Save model checkpoint if improved validation score, or always save it if unsupervised
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score or \
                    args.dataset_type == 'unsupervised':
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        if args.dataset_type == 'unsupervised':
            return [0]  # rest of this is meaningless when unsupervised

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        if args.split_test_by_overlap_dataset is not None:
            # Report test scores separately for test molecules whose smiles do /
            # do not appear in the overlap dataset (reporting only; these scores
            # do not feed into the ensemble).
            overlap_data = get_data(path=args.split_test_by_overlap_dataset,
                                    logger=logger)
            overlap_smiles = set(overlap_data.smiles())
            test_data_intersect, test_data_nonintersect = [], []
            for d in test_data.data:
                if d.smiles in overlap_smiles:
                    test_data_intersect.append(d)
                else:
                    test_data_nonintersect.append(d)
            test_data_intersect, test_data_nonintersect = MoleculeDataset(
                test_data_intersect), MoleculeDataset(test_data_nonintersect)
            for name, td in [('Intersect', test_data_intersect),
                             ('Nonintersect', test_data_nonintersect)]:
                test_preds = predict(model=model,
                                     data=td,
                                     args=args,
                                     scaler=scaler,
                                     logger=logger)
                test_scores = evaluate_predictions(
                    preds=test_preds,
                    targets=td.targets(),
                    metric_func=metric_func,
                    dataset_type=args.dataset_type,
                    args=args,
                    logger=logger)
                avg_test_score = np.nanmean(test_scores)
                info(
                    f'Model {model_idx} test {args.metric} for {name} = {avg_test_score:.6f}'
                )

        if len(
                test_data
        ) == 0:  # just get some garbage results without crashing; in this case we didn't care anyway
            test_preds, test_scores = sum_test_preds, [
                0 for _ in range(len(args.task_names))
            ]
        else:
            test_preds = predict(model=model,
                                 data=test_data,
                                 args=args,
                                 scaler=scaler,
                                 logger=logger)
            test_scores = evaluate_predictions(preds=test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

        if args.maml:
            # Lazily size the accumulator from the first model's predictions
            if sum_test_preds is None:
                sum_test_preds = np.zeros(np.array(test_preds).shape)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                sum_test_preds['features'] += np.array(test_preds['features'])
            sum_test_preds['vocab'] += np.array(test_preds['vocab'])
        else:
            sum_test_preds += np.array(test_preds)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                debug(
                    f'Model {model_idx} test features rmse = {test_scores["features"]:.6f}'
                )
                writer.add_scalar('test_features_rmse',
                                  test_scores['features'], 0)
            test_scores = [test_scores['vocab']]

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                if task_name in desired_labels:
                    info(
                        f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                    )
                    writer.add_scalar(f'test_{task_name}_{args.metric}',
                                      test_score, n_iter)

    # Evaluate ensemble on test set
    if args.dataset_type == 'bert_pretraining':
        avg_test_preds = {
            'features':
            (sum_test_preds['features'] / args.ensemble_size).tolist()
            if sum_test_preds['features'] is not None else None,
            'vocab': (sum_test_preds['vocab'] / args.ensemble_size).tolist()
        }
    else:
        avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    if len(test_data
           ) == 0:  # just return some garbage when we didn't want test data
        ensemble_scores = test_scores
    else:
        ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

    # Average ensemble score
    # NOTE: `writer` below is the SummaryWriter of the last ensemble member.
    if args.dataset_type == 'bert_pretraining':
        if ensemble_scores['features'] is not None:
            info(
                f'Ensemble test features rmse = {ensemble_scores["features"]:.6f}'
            )
            writer.add_scalar('ensemble_test_features_rmse',
                              ensemble_scores['features'], 0)
        ensemble_scores = [ensemble_scores['vocab']]

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
def train_sgld_pdts(model, train_data_loader, loss_func, scaler,
                    features_scaler, args, save_dir, batch_no):
    """
    Runs SGLD training in repeated learning-rate cycles, saving one checkpoint per cycle.

    The loop runs ``int(args.mix_epochs * args.samples)`` epochs. At the start of
    every block of ``args.mix_epochs`` epochs a fresh SGLD optimizer and
    OneCycleLR scheduler are built; after the last epoch of each block the model
    is saved as ``model_<sample_idx>.pt`` inside ``<save_dir>/model_<batch_no>``.
    The parameter named ``log_noise`` has ``requires_grad`` set to False before
    training starts.

    :param model: Model to train; its ``encoder``, ``ffn`` and ``log_noise`` attributes are used.
    :param train_data_loader: Passed to ``train`` as ``data_loader``.
    :param loss_func: Loss function passed to ``train``.
    :param scaler: Passed through to ``save_checkpoint``.
    :param features_scaler: Passed through to ``save_checkpoint``.
    :param args: Arguments; reads ``mix_epochs``, ``samples``, ``lr_max_sgld``,
                 ``weight_decay_sgld``, ``train_data_size`` and ``batch_size``.
    :param save_dir: Parent directory in which the ``model_<batch_no>`` directory is created.
    :param batch_no: Index used to name the checkpoint directory.
    :return: The model object that was passed in, after training.
    """
    save_dir_sgld = os.path.join(save_dir, f'model_{batch_no}')
    makedirs(save_dir_sgld)

    # number of sgld epochs
    epochs_sgld = int(args.mix_epochs * args.samples)

    # freeze log noise
    for name, parameter in model.named_parameters():
        if name == 'log_noise':
            parameter.requires_grad = False

    print("----------SGLD training----------")

    # training loop
    n_iter = 0
    sample_idx = 0
    for epoch in range(epochs_sgld):

        # Start of a new cycle: rebuild optimizer and scheduler from scratch.
        # Epoch 0 always satisfies this, so both names are bound before use.
        if epoch % args.mix_epochs == 0:
            print('\n********** resetting scheduler **********')

            # Third group (log_noise) gets a 5x smaller learning rate and no
            # added noise; the other two groups use the optimizer defaults.
            param_groups = [{
                'params': model.encoder.parameters()
            }, {
                'params': model.ffn.parameters()
            }, {
                'params': model.log_noise,
                'lr': args.lr_max_sgld / 5 / 25,
                'addnoise': False
            }]
            optimizer = SGLD(param_groups,
                             args,
                             lr=args.lr_max_sgld / 25,
                             weight_decay=args.weight_decay_sgld,
                             addnoise=True)

            # One max_lr per parameter group, in the same order as above
            scheduler = OneCycleLR(
                optimizer,
                max_lr=[
                    args.lr_max_sgld, args.lr_max_sgld, args.lr_max_sgld / 5
                ],
                epochs=args.mix_epochs,
                # ceiling division: batches needed to cover train_data_size
                steps_per_epoch=-(-args.train_data_size // args.batch_size),
                pct_start=0.2,
                anneal_strategy='cos',
                cycle_momentum=False,
                div_factor=25.0,
                final_div_factor=10000)

        print(f'SGLD epoch {epoch}')

        n_iter = train(model=model,
                       data_loader=train_data_loader,
                       loss_func=loss_func,
                       optimizer=optimizer,
                       scheduler=scheduler,
                       args=args,
                       n_iter=n_iter)

        # collect model samples at the end of each cycle
        if (epoch + 1) % args.mix_epochs == 0:
            print(
                f'---------- collecting sgld sample {sample_idx} ----------\n')
            save_checkpoint(
                os.path.join(save_dir_sgld, f'model_{sample_idx}.pt'), model,
                scaler, features_scaler, args)
            sample_idx += 1

    return model