Example #1
def run_comparison(experiment_args: Namespace,
                   logger: logging.Logger,
                   features_dir: str = None):
    for dataset_name in experiment_args.datasets:
        dataset_type, dataset_path, num_folds, metric = DATASETS[dataset_name]
        logger.info(dataset_name)

        # Set up args
        args = deepcopy(experiment_args)
        args.data_path = dataset_path
        args.dataset_type = dataset_type
        args.save_dir = os.path.join(args.save_dir, dataset_name)
        args.num_folds = num_folds
        args.metric = metric
        if features_dir is not None:
            args.features_path = [
                os.path.join(features_dir, dataset_name + '.pckl')
            ]
        modify_train_args(args)

        # Set up logging for training
        os.makedirs(args.save_dir, exist_ok=True)
        fh = logging.FileHandler(os.path.join(args.save_dir, args.log_name))
        fh.setLevel(logging.DEBUG)

        # Cross validate
        TRAIN_LOGGER.addHandler(fh)
        mean_score, std_score = cross_validate(args, TRAIN_LOGGER)
        TRAIN_LOGGER.removeHandler(fh)
        fh.close()

        # Record results
        logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))
        temp_model = build_model(args)
        logger.info('num params: {:,}'.format(param_count(temp_model)))
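
The tuple unpacked from DATASETS[dataset_name] above implies a module-level registry mapping each dataset name to (dataset_type, data_path, num_folds, metric). A minimal usage sketch, assuming run_comparison and its helpers above are in scope; the dataset names, paths and values below are placeholders, not taken from the source:

import logging
from argparse import Namespace

# Hypothetical registry: name -> (dataset_type, data_path, num_folds, metric)
DATASETS = {
    'dataset_a': ('regression', 'data/dataset_a.csv', 10, 'rmse'),
    'dataset_b': ('classification', 'data/dataset_b.csv', 3, 'auc'),
}

logger = logging.getLogger('comparison')
logger.setLevel(logging.INFO)

# Only the fields that run_comparison reads directly are sketched here.
experiment_args = Namespace(
    datasets=['dataset_a', 'dataset_b'],
    save_dir='results/comparison',
    log_name='train.log',
)

run_comparison(experiment_args, logger)
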
Example #2
    def objective(hyperparams: Dict[str, Union[int, float]],
                  seed: int) -> Dict:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Copy args
        hyper_args = deepcopy(args)

        # Update args with hyperparams
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}'
                                   for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir,
                                               folder_name)

        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        hyper_args.ffn_hidden_size = hyper_args.hidden_size

        # Cross validate
        mean_score, std_score = cross_validate(args=hyper_args,
                                               train_func=run_training)

        # Record results
        temp_model = MoleculeModel(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'Trial results with seed {seed}')
        logger.info(hyperparams)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError(
                    'Can\'t handle nan score for non-classification dataset.')

        loss = (1 if hyper_args.minimize_score else -1) * mean_score

        return {
            'loss': loss,
            'status': 'ok',
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params,
            'seed': seed,
        }
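
The dictionary returned above ('loss', 'status', 'mean_score', ...) is the shape hyperopt expects from an objective, so one plausible driver is hyperopt's fmin with the seed bound through functools.partial, assuming objective above is in scope. The search space below is illustrative; the real keys and ranges (and INT_KEYS) live in the surrounding module.

from functools import partial

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space; integer-valued keys are cast back to int by objective.
space = {
    'depth': hp.quniform('depth', low=2, high=6, q=1),
    'dropout': hp.quniform('dropout', low=0.0, high=0.4, q=0.05),
    'hidden_size': hp.quniform('hidden_size', low=300, high=2400, q=100),
}

trials = Trials()
fmin(
    fn=partial(objective, seed=0),  # objective defined above
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

# trials.results holds each dict returned by objective; pick the lowest loss.
best_result = min(trials.results, key=lambda result: result['loss'])
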
Example #3
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Copy args
        hyper_args = deepcopy(args)

        # Update args with hyperparams
        if args.save_dir is not None:
            folder_name = "_".join(f"{key}_{value}"
                                   for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir,
                                               folder_name)

        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        hyper_args.ffn_hidden_size = hyper_args.hidden_size

        # Record hyperparameters
        logger.info(hyperparams)

        # Cross validate
        mean_score, std_score = cross_validate(args=hyper_args,
                                               train_func=run_training)

        # Record results
        temp_model = MoleculeModel(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f"num params: {num_params:,}")
        logger.info(f"{mean_score} +/- {std_score} {hyper_args.metric}")

        results.append({
            "mean_score": mean_score,
            "std_score": std_score,
            "hyperparams": hyperparams,
            "num_params": num_params,
        })

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == "classification":
                mean_score = 0
            else:
                raise ValueError(
                    "Can't handle nan score for non-classification dataset.")

        return (1 if hyper_args.minimize_score else -1) * mean_score
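
Since this variant returns only the sign-adjusted loss and logs everything else into the enclosing results list, the best trial is typically recovered from results after the optimization loop finishes. A minimal sketch, assuming results and args from the enclosing function are in scope; the output file name is hypothetical:

import json
import os

import numpy as np

# Keep only trials that produced a finite score, then pick the best one in the
# direction indicated by args.minimize_score.
valid_results = [result for result in results if not np.isnan(result['mean_score'])]
best_result = min(
    valid_results,
    key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'],
)

# One plausible way to reuse the winning hyperparameters: dump them as JSON.
config_save_path = os.path.join(args.save_dir, 'best_hyperparams.json')
with open(config_save_path, 'w') as f:
    json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
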
Example #4
def evaluate_glasses_dataset(args: Namespace, logger: Logger = None):

    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load data
    debug('Loading data')
    data = GlassDataset(args.test_data_path, transform=Compose([NNGraph(args.num_neighbors), Distance(False)]))

    # Dataset length
    data_length = len(data)
    debug('data size = {:,}'.format(data_length))

    data = DataLoader(data, args.batch_size)
    metric_func = get_metric_func(args.metric)

    for model_idx in range(args.ensemble_size):

        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx])
        else:
            debug('Must provide a checkpoint path!')
            exit(1)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))

        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        test_scores = evaluate(
            model=model,
            data=data,
            metric_func=metric_func,
            args=args
        )

        # Average test score
        avg_test_score = np.mean(test_scores)
        info('Model {} test {} = {:.3f}'.format(model_idx, args.metric, avg_test_score))
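
A sketch of the Namespace that evaluate_glasses_dataset reads; each field mirrors an attribute accessed above, and the concrete values and paths are placeholders rather than values from the source:

from argparse import Namespace

import torch

args = Namespace(
    test_data_path='data/glass_test.json',              # placeholder path
    num_neighbors=12,                                    # neighbors for NNGraph
    batch_size=32,
    metric='auc',
    ensemble_size=1,
    checkpoint_paths=['checkpoints/model_0/model.pt'],   # placeholder path
    cuda=torch.cuda.is_available(),
)

evaluate_glasses_dataset(args)
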
Example #5
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:

        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Update args with hyperparams
        hyper_args = deepcopy(args)
        if args.save_dir is not None:
            folder_name = '_'.join(
                [f'{key}_{value}' for key, value in hyperparams.items()])
            hyper_args.save_dir = os.path.join(hyper_args.save_dir,
                                               folder_name)
        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        logger.info(hyperparams)

        # Train
        avg_test_score, avg_test_accuracy = run_training(
            hyper_args, train_logger)

        # Record results
        temp_model = build_model(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{avg_test_score} {hyper_args.metric}')
        logger.info(f'{avg_test_accuracy} accuracy')

        results.append({
            'avg_test_score': avg_test_score,
            'avg_test_accuracy': avg_test_accuracy,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # Deal with nan
        if np.isnan(avg_test_score):
            if hyper_args.dataset_type == 'classification':
                avg_test_score = 0
            else:
                raise ValueError(
                    'Can\'t handle nan score for non-classification dataset.')

        return (1 if hyper_args.minimize_score else -1) * avg_test_score
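
This objective closes over args, results, logger, train_logger and INT_KEYS, which have to be defined by an enclosing optimization routine. A minimal sketch of that enclosing skeleton; the names other than objective itself are illustrative assumptions:

from logging import Logger
from typing import Dict, List, Union

# Illustrative: keys whose sampled values must be cast back to int before training.
INT_KEYS = ['hidden_size', 'depth', 'ffn_num_layers']


def optimize_hyperparameters(args, logger: Logger, train_logger: Logger) -> List[Dict]:
    results: List[Dict] = []

    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        ...  # body as in the example above

    # An optimization driver (hyperopt's fmin, random search, a fixed grid, ...)
    # repeatedly calls objective(hyperparams) here, populating `results`.
    return results
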
Example #6
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Update args with hyperparams
        hyper_args = deepcopy(args)
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}'
                                   for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir,
                                               folder_name)
        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        # Record hyperparameters
        logger.info(hyperparams)

        # Cross validate
        mean_score, std_score = cross_validate(hyper_args, TRAIN_LOGGER)

        # Record results
        temp_model = build_model(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        results.append({
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError(
                    'Can\'t handle nan score for non-classification dataset.')

        return (1 if hyper_args.minimize_score else -1) * mean_score
Example #7
        def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
            # Convert hyperparams from float to int when necessary
            for key in INT_KEYS:
                hyperparams[key] = int(hyperparams[key])

            # Copy args
            gs_args = deepcopy(dataset_args)

            for key, value in hyperparams.items():
                setattr(gs_args, key, value)

            # Record hyperparameters
            logger.info(hyperparams)

            # Cross validate
            mean_score, std_score = cross_validate(gs_args, TRAIN_LOGGER)

            # Record results
            temp_model = build_model(gs_args)
            num_params = param_count(temp_model)
            logger.info('num params: {:,}'.format(num_params))
            logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))

            results.append({
                'mean_score': mean_score,
                'std_score': std_score,
                'hyperparams': hyperparams,
                'num_params': num_params
            })

            # Deal with nan
            if np.isnan(mean_score):
                if gs_args.dataset_type == 'classification':
                    mean_score = 0
                else:
                    raise ValueError(
                        'Can\'t handle nan score for non-classification dataset.'
                    )

            return (1 if gs_args.minimize_score else -1) * mean_score
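
Unlike the hyperopt-driven variants above, an objective like this also fits a plain grid search where every combination is enumerated up front. A sketch of such a driver, assuming objective as defined above is in scope; the grid values are illustrative:

from itertools import product

# Illustrative grid; real values would come from the experiment configuration.
grid = {
    'depth': [3, 4, 5],
    'dropout': [0.0, 0.2],
    'hidden_size': [300, 600, 1200],
}

keys = sorted(grid)
for values in product(*(grid[key] for key in keys)):
    hyperparams = dict(zip(keys, values))
    objective(hyperparams)  # each call appends its result to the enclosing `results` list
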
Example #8
def run_training(args: Namespace, logger: Logger = None):
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.
    :param args: args info
    :param logger: logger info
    :return: Optimal average test score (for use in hyperparameter optimization via Hyperopt)
    """

    # Set up logger
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load metadata
    with open(args.data_path) as f:
        metadata = json.load(f)

    # Train/val/test split
    if args.k_fold_split:
        data_splits = []
        kf = KFold(n_splits=args.num_folds, shuffle=True, random_state=args.seed)
        for train_index, test_index in kf.split(metadata):
            splits = [train_index, test_index]
            data_splits.append(splits)
        data_splits = data_splits[args.fold_index]

        if args.use_inner_test:
            train_indices, remaining_indices = train_test_split(data_splits[0], test_size=args.val_test_size,
                                                                random_state=args.seed)
            validation_indices, test_indices = train_test_split(remaining_indices, test_size=0.5,
                                                                random_state=args.seed)

        else:
            train_indices = data_splits[0]
            validation_indices, test_indices = train_test_split(data_splits[1], test_size=0.5, random_state=args.seed)

        train_metadata = list(np.asarray(metadata)[list(train_indices)])
        validation_metadata = list(np.asarray(metadata)[list(validation_indices)])
        test_metadata = list(np.asarray(metadata)[list(test_indices)])

    else:
        train_metadata, remaining_metadata = train_test_split(metadata, test_size=args.val_test_size,
                                                              random_state=args.seed)
        validation_metadata, test_metadata = train_test_split(remaining_metadata, test_size=0.5, random_state=args.seed)

    # Load datasets
    debug('Loading data')
    transform = Compose([Augmentation(args.augmentation_length), NNGraph(args.num_neighbors), Distance(False)])
    train_data = GlassDataset(train_metadata, transform=transform)
    val_data = GlassDataset(validation_metadata, transform=transform)
    test_data = GlassDataset(test_metadata, transform=transform)
    args.atom_fdim = 3
    args.bond_fdim = args.atom_fdim + 1

    # Dataset lengths
    train_data_length, val_data_length, test_data_length = len(train_data), len(val_data), len(test_data)
    debug('train size = {:,} | val size = {:,} | test size = {:,}'.format(
        train_data_length,
        val_data_length,
        test_data_length)
    )

    # Convert to iterators
    train_data = DataLoader(train_data, args.batch_size)
    val_data = DataLoader(val_data, args.batch_size)
    test_data = DataLoader(test_data, args.batch_size)

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(args.metric)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, 'model_{}'.format(model_idx))
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx], args.save_dir, attention_viz=args.attention_viz)
        else:
            debug('Building model {}'.format(model_idx))
            model = build_model(args)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))

        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(model, args, os.path.join(save_dir, 'model.pt'))

        # Optimizer and learning rate scheduler
        optimizer = Adam(model.parameters(), lr=args.init_lr[model_idx], weight_decay=args.weight_decay[model_idx])

        scheduler = NoamLR(
            optimizer,
            warmup_epochs=args.warmup_epochs,
            total_epochs=[args.epochs],
            steps_per_epoch=train_data_length // args.batch_size,
            init_lr=args.init_lr,
            max_lr=args.max_lr,
            final_lr=args.final_lr
        )

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug('Epoch {}'.format(epoch))

            n_iter = train(
                model=model,
                data=train_data,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )

            val_scores = []
            for val_runs in range(args.num_val_runs):

                val_batch_scores = evaluate(
                    model=model,
                    data=val_data,
                    metric_func=metric_func,
                    args=args,
                )

                val_scores.append(np.mean(val_batch_scores))

            # Average validation score
            avg_val_score = np.mean(val_scores)
            debug('Validation {} = {:.3f}'.format(args.metric, avg_val_score))
            writer.add_scalar('validation_{}'.format(args.metric), avg_val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(model, args, os.path.join(save_dir, 'model.pt'))

        # Evaluate on test set using model with best validation score
        info('Model {} best validation {} = {:.3f} on epoch {}'.format(model_idx, args.metric, best_score, best_epoch))
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), args.save_dir, cuda=args.cuda,
                                attention_viz=args.attention_viz)

        test_scores = []
        for test_runs in range(args.num_test_runs):

            test_batch_scores = evaluate(
                model=model,
                data=test_data,
                metric_func=metric_func,
                args=args
            )

            test_scores.append(np.mean(test_batch_scores))

        # Get accuracy (assuming args.metric is set to AUC)
        metric_func_accuracy = get_metric_func('accuracy')
        test_scores_accuracy = []
        for test_runs in range(args.num_test_runs):

            test_batch_scores = evaluate(
                model=model,
                data=test_data,
                metric_func=metric_func_accuracy,
                args=args
            )

            test_scores_accuracy.append(np.mean(test_batch_scores))

        # Average test score
        avg_test_score = np.mean(test_scores)
        avg_test_accuracy = np.mean(test_scores_accuracy)
        info('Model {} test {} = {:.3f}, test {} = {:.3f}'.format(model_idx, args.metric,
                                                                  avg_test_score, 'accuracy', avg_test_accuracy))
        writer.add_scalar('test_{}'.format(args.metric), avg_test_score, n_iter)

        return avg_test_score, avg_test_accuracy  # For hyperparameter optimization or cross validation use
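
The checkpointing condition in the training loop above relies on "and" binding tighter than "or"; restated as a small helper with explicit branches, the intent is simply whether the validation score improved in the configured direction:

def improved(avg_val_score: float, best_score: float, minimize_score: bool) -> bool:
    """Return True when the new validation score beats the best score so far."""
    if minimize_score:
        return avg_val_score < best_score
    return avg_val_score > best_score
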
Example #9
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """Trains a model and returns test scores on the model checkpoint with the highest validation score"""
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(args.data_path, args)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug('Number of tasks = {}'.format(args.num_tasks))

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':  # Note: for now, binning based on whole dataset, not just training set
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug('Splitting data with seed {}'.format(args.seed))
        train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
        _, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
    else:
        debug('Splitting data with seed {}'.format(args.seed))
        if args.separate_test_set:
            test_data = get_data(args.separate_test_set, args)
            if args.separate_val_set:
                val_data = get_data(args.separate_val_set, args)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
        else:
            train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug('{} '.format(args.task_names[i]) +
                  ', '.join('{}: {:.2f}%'.format(cls, size * 100) for cls, size in enumerate(task_class_sizes)))

        if args.class_balance:
            train_class_sizes = get_class_sizes(train_data)
            class_batch_counts = torch.Tensor(train_class_sizes) * args.batch_size
            args.class_weights = 1 / torch.Tensor(class_batch_counts)

    if args.save_smiles_splits:
        with open(args.data_path, 'r') as f:
            header = f.readline().strip()
            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(f):
                line = line.strip()
                smiles = line.split(',')[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w') as f:
                f.write('smiles\n')
                for smiles in dataset.smiles():
                    f.write(smiles.strip() + '\n')
            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w') as f:
                f.write(header + '\n')
                for smiles in dataset.smiles():
                    f.write(lines_by_smiles[smiles] + '\n')
            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
            split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=None if args.predict_features else 0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data) if args.prespecified_chunk_dir is None else args.prespecified_chunks_max_examples_per_epoch

    if args.adversarial or args.moe:
        val_smiles, test_smiles = val_data.smiles(), test_data.smiles()
    
    debug('Total size = {:,} | train size = {:,} | val size = {:,} | test size = {:,}'.format(
        len(data), len(train_data), len(val_data), len(test_data)))

    # Optionally truncate outlier values
    if args.truncate_outliers:
        print('Truncating outliers in train set')
        train_data = truncate_outliers(train_data)

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression' and args.target_scaling:
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    if args.moe:
        train_data = cluster_split(train_data, 
                                   args.num_sources, 
                                   args.cluster_max_ratio, 
                                   seed=args.cluster_split_seed, 
                                   logger=logger)

    # Chunk training data if too large to load in memory all at once
    if args.num_chunks > 1:
        os.makedirs(args.chunk_temp_dir, exist_ok=True)
        train_paths = []
        if args.moe:
            chunked_sources = [td.chunk(args.num_chunks) for td in train_data]
            chunks = []
            for i in range(args.num_chunks):
                chunks.append([source[i] for source in chunked_sources])
        else:
            chunks = train_data.chunk(args.num_chunks)
        for i in range(args.num_chunks):
            chunk_path = os.path.join(args.chunk_temp_dir, str(i) + '.txt')
            memo_path = os.path.join(args.chunk_temp_dir, 'memo' + str(i) + '.txt')
            with open(chunk_path, 'wb') as f:
                pickle.dump(chunks[i], f)
            train_paths.append((chunk_path, memo_path))
        train_data = train_paths

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric, args=args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.maml:  # TODO refactor
        test_targets = []
        for task_idx in range(len(data.data[0].targets)):
            _, task_test_data, _ = test_data.sample_maml_task(args, seed=0)
            test_targets += task_test_data.targets()

    if args.dataset_type == 'bert_pretraining':
        sum_test_preds = {
            'features': np.zeros((len(test_smiles), args.features_size)) if args.features_size is not None else None,
            'vocab': np.zeros((len(test_targets['vocab']), args.vocab.output_size))
        }
    elif args.dataset_type == 'kernel':
        sum_test_preds = np.zeros((len(test_targets), args.num_tasks))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    if args.maml:
        sum_test_preds = None  # annoying to determine exact size; will initialize later

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        test_targets['vocab'] = [target if mask == 0 else None for target, mask in zip(test_targets['vocab'], test_data.mask())]

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, 'model_{}'.format(model_idx))
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx], current_args=args, logger=logger)
        else:
            debug('Building model {}'.format(model_idx))
            model = build_model(args)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.adjust_weight_decay:
            args.pnorm_target = compute_pnorm(model)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug('Epoch {}'.format(epoch))

            if args.prespecified_chunk_dir is not None:
                # load some different random chunks each epoch
                train_data, val_data = load_prespecified_chunks(args, logger)
                debug('Loaded prespecified chunks for epoch')

            if args.dataset_type == 'unsupervised':  # won't work with moe
                full_data = MoleculeDataset(train_data.data + val_data.data)
                generate_unsupervised_cluster_labels(build_model(args), full_data, args)  # cluster with a new random init
                model.create_ffn(args)  # reset the ffn since we're changing targets-- we're just pretraining the encoder.
                optimizer.param_groups.pop()  # remove ffn parameters
                optimizer.add_param_group({'params': model.ffn.parameters(), 'lr': args.init_lr[1], 'weight_decay': args.weight_decay[1]})
                if args.cuda:
                    model.ffn.cuda()
            
            if args.gradual_unfreezing:
                if epoch % args.epochs_per_unfreeze == 0:
                    unfroze_layer = model.unfreeze_next()  # consider just stopping early after we have nothing left to unfreeze?
                    if unfroze_layer:
                        debug('Unfroze last frozen layer')

            n_iter = train(
                model=model,
                data=train_data,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer,
                chunk_names=(args.num_chunks > 1),
                val_smiles=val_smiles if args.adversarial else None,
                test_smiles=test_smiles if args.adversarial or args.moe else None
            )
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data=val_data,
                metric_func=metric_func,
                args=args,
                scaler=scaler,
                logger=logger
            )

            if args.dataset_type == 'bert_pretraining':
                if val_scores['features'] is not None:
                    debug('Validation features rmse = {:.6f}'.format(val_scores['features']))
                    writer.add_scalar('validation_features_rmse', val_scores['features'], n_iter)
                val_scores = [val_scores['vocab']]

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug('Validation {} = {:.6f}'.format(args.metric, avg_val_score))
            writer.add_scalar('validation_{}'.format(args.metric), avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    if task_name in desired_labels:
                        debug('Validation {} {} = {:.6f}'.format(task_name, args.metric, val_score))
                        writer.add_scalar('validation_{}_{}'.format(task_name, args.metric), val_score, n_iter)

            # Save model checkpoint if improved validation score, or always save it if unsupervised
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score or \
                    args.dataset_type == 'unsupervised':
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.dataset_type == 'unsupervised':
            return [0]  # rest of this is meaningless when unsupervised            

        # Evaluate on test set using model with best validation score
        info('Model {} best validation {} = {:.6f} on epoch {}'.format(model_idx, args.metric, best_score, best_epoch))
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

        if args.split_test_by_overlap_dataset is not None:
            overlap_data = get_data(args.split_test_by_overlap_dataset)
            overlap_smiles = set(overlap_data.smiles())
            test_data_intersect, test_data_nonintersect = [], []
            for d in test_data.data:
                if d.smiles in overlap_smiles:
                    test_data_intersect.append(d)
                else:
                    test_data_nonintersect.append(d)
            test_data_intersect, test_data_nonintersect = MoleculeDataset(test_data_intersect), MoleculeDataset(test_data_nonintersect)
            for name, td in [('Intersect', test_data_intersect), ('Nonintersect', test_data_nonintersect)]:
                test_preds = predict(
                    model=model,
                    data=td,
                    args=args,
                    scaler=scaler,
                    logger=logger
                )
                test_scores = evaluate_predictions(
                    preds=test_preds,
                    targets=td.targets(),
                    metric_func=metric_func,
                    dataset_type=args.dataset_type,
                    args=args,
                    logger=logger
                )
                avg_test_score = np.nanmean(test_scores)
                info('Model {} test {} for {} = {:.6f}'.format(model_idx, args.metric, name, avg_test_score))
        
        if len(test_data) == 0:  # just get some garbage results without crashing; in this case we didn't care anyway
            test_preds, test_scores = sum_test_preds, [0 for _ in range(len(args.task_names))]
        else:
            test_preds = predict(
                model=model,
                data=test_data,
                args=args,
                scaler=scaler,
                logger=logger
            )
            test_scores = evaluate_predictions(
                preds=test_preds,
                targets=test_targets,
                metric_func=metric_func,
                dataset_type=args.dataset_type,
                args=args,
                logger=logger
            )

        if args.maml:
            if sum_test_preds is None:
                sum_test_preds = np.zeros(np.array(test_preds).shape)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                sum_test_preds['features'] += np.array(test_preds['features'])
            sum_test_preds['vocab'] += np.array(test_preds['vocab'])
        else:
            sum_test_preds += np.array(test_preds)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                debug('Model {} test features rmse = {:.6f}'.format(model_idx, test_scores['features']))
                writer.add_scalar('test_features_rmse', test_scores['features'], 0)
            test_scores = [test_scores['vocab']]

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info('Model {} test {} = {:.6f}'.format(model_idx, args.metric, avg_test_score))
        writer.add_scalar('test_{}'.format(args.metric), avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                if task_name in desired_labels:
                    info('Model {} test {} {} = {:.6f}'.format(model_idx, task_name, args.metric, test_score))
                    writer.add_scalar('test_{}_{}'.format(task_name, args.metric), test_score, n_iter)

    # Evaluate ensemble on test set
    if args.dataset_type == 'bert_pretraining':
        avg_test_preds = {
            'features': (sum_test_preds['features'] / args.ensemble_size).tolist() if sum_test_preds['features'] is not None else None,
            'vocab': (sum_test_preds['vocab'] / args.ensemble_size).tolist()
        }
    else:
        avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    if len(test_data) == 0:  # just return some garbage when we didn't want test data
        ensemble_scores = test_scores
    else:
        ensemble_scores = evaluate_predictions(
            preds=avg_test_preds,
            targets=test_targets,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            args=args,
            logger=logger
        )

    # Average ensemble score
    if args.dataset_type == 'bert_pretraining':
        if ensemble_scores['features'] is not None:
            info('Ensemble test features rmse = {:.6f}'.format(ensemble_scores['features']))
            writer.add_scalar('ensemble_test_features_rmse', ensemble_scores['features'], 0)
        ensemble_scores = [ensemble_scores['vocab']]

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info('Ensemble test {} = {:.6f}'.format(args.metric, avg_ensemble_test_score))
    writer.add_scalar('ensemble_test_{}'.format(args.metric), avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info('Ensemble test {} {} = {:.6f}'.format(task_name, args.metric, ensemble_score))

    return ensemble_scores
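
Several of the earlier examples wrap a run_training like this one in cross_validate(args, logger). A minimal sketch of such a wrapper, assuming run_training returns one score per task as above; the fold handling is simplified and this helper is not taken from the source:

import os
from argparse import Namespace
from copy import deepcopy
from logging import Logger
from typing import Tuple

import numpy as np


def cross_validate(args: Namespace, logger: Logger) -> Tuple[float, float]:
    """Run run_training across several seeds/folds and aggregate the scores."""
    init_seed, save_dir = args.seed, args.save_dir

    fold_means = []
    for fold_num in range(args.num_folds):
        fold_args = deepcopy(args)
        fold_args.seed = init_seed + fold_num
        fold_args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        os.makedirs(fold_args.save_dir, exist_ok=True)
        model_scores = run_training(fold_args, logger)
        fold_means.append(np.nanmean(model_scores))

    # Mean and standard deviation of the per-fold scores.
    return float(np.mean(fold_means)), float(np.std(fold_means))
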
Example #10
def run_evaluation(args: Namespace, logger: Logger = None):
    """
    Evaluates a saved model
    :param args: Set of args
    :param logger: Logger saved in save_dir
    """

    # Set up logger
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load metadata
    with open(args.data_path) as f:
        metadata = json.load(f)

    # Train/val/test split
    train_metadata, remaining_metadata = train_test_split(metadata,
                                                          test_size=0.3,
                                                          random_state=0)
    validation_metadata, test_metadata = train_test_split(remaining_metadata,
                                                          test_size=0.5,
                                                          random_state=0)

    # Load data
    debug('Loading data')

    transform = Compose([
        Augmentation(args.augmentation_length),
        NNGraph(args.num_neighbors),
        Distance(False)
    ])
    test_data = GlassDataset(test_metadata, transform=transform)
    args.atom_fdim = 3
    args.bond_fdim = args.atom_fdim + 1

    # Dataset lengths
    test_data_length = len(test_data)
    debug('test size = {:,}'.format(test_data_length))

    # Convert to iterators
    test_data = DataLoader(test_data, args.batch_size)

    # Get loss and metric functions
    metric_func = get_metric_func(args.metric)

    # Test ensemble of models
    for model_idx in range(args.ensemble_size):

        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(
                model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    args.save_dir,
                                    cuda=args.cuda,
                                    attention_viz=args.attention_viz)
        else:
            debug('Must specify a model to load')
            exit(1)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))

        # Evaluate on test set using model with best validation score
        test_scores = []
        for test_runs in range(args.num_test_runs):

            test_batch_scores = evaluate(model=model,
                                         data=test_data,
                                         metric_func=metric_func,
                                         args=args)

            test_scores.append(np.mean(test_batch_scores))

        # Average test score
        avg_test_score = np.mean(test_scores)
        info('Model {} test {} = {:.3f}'.format(model_idx, args.metric,
                                                avg_test_score))