Example 1
    def test_create_deepnull_prediction(self, target_is_binary):
        """End-to-end check of `create_deepnull_prediction` on synthetic data."""
        num_samples = 1000
        expected_df = _create_df(target_is_binary=target_is_binary,
                                 size=num_samples)
        expected_df['FID'] = np.arange(num_samples)
        expected_df['IID'] = np.arange(num_samples)
        expected_df['unused_str_column'] = np.random.choice(
            list('abcdefg'), num_samples)

        # Run on a deep copy so we can verify the input is left untouched.
        input_df = expected_df.copy(deep=True)
        params = model_lib.ModelParameters(batch_size=32,
                                           mlp_units=[32, 16],
                                           num_epochs=1)
        with tempfile.TemporaryDirectory() as logdir:
            (final_df, _, _,
             test_perf_df) = train_eval.create_deepnull_prediction(
                 input_df=input_df,
                 target='label',
                 target_is_binary=target_is_binary,
                 covariates=['cov1', 'cov2'],
                 prediction_column='label_deepnull',
                 num_folds=3,
                 model_params=params,
                 logdir=logdir,
                 verbosity=0)

        # The input dataframe must not be mutated in place.
        pd.testing.assert_frame_equal(expected_df, input_df)
        # Output is the input plus exactly one new prediction column.
        self.assertCountEqual(final_df.columns,
                              list(expected_df.columns) + ['label_deepnull'])
        pd.testing.assert_frame_equal(expected_df,
                                      final_df[expected_df.columns])
        self.assertCountEqual(
            test_perf_df.columns,
            ['IID', 'label', 'label_deepnull', 'label_deepnull_eval_fold'])
Example 2
    def test_predict(self, target_is_binary):
        """Checks that `predict` returns non-null predictions keyed by IID."""
        df = _create_df(target_is_binary=target_is_binary, size=1000)
        # Carve out train / eval / test splits from the synthetic data.
        train_df, eval_df = df.iloc[:600], df.iloc[600:800]
        test_df = df.iloc[800:].copy(deep=True)
        test_df['IID'] = np.arange(len(test_df))

        params = model_lib.ModelParameters(batch_size=32,
                                           mlp_units=[32, 16],
                                           num_epochs=1)
        with tempfile.TemporaryDirectory() as logdir:
            model, _, _ = train_eval.train_deepnull_model(
                train_df=train_df,
                eval_df=eval_df,
                target='label',
                target_is_binary=target_is_binary,
                covariates=['cov1', 'cov2'],
                model_params=params,
                logdir=logdir,
                verbosity=0,
            )

        actual = train_eval.predict(deepnull_model=model,
                                    df=test_df,
                                    covariates=['cov1', 'cov2'],
                                    prediction_column='deepnull_pred')
        self.assertCountEqual(actual.columns, ['IID', 'deepnull_pred'])
        pd.testing.assert_series_equal(actual.IID, test_df.IID)
        # Every test individual should receive a prediction.
        self.assertEqual(actual.deepnull_pred.isnull().sum(), 0)
Example 3
def main(argv: Sequence[str]) -> None:
    """Trains DeepNull end-to-end and writes predictions to the output TSV."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    tf.random.set_seed(_SEED.value)
    logging.info('Loading data from %s', _INPUT_TSV.value)
    input_df, binary_col_map = data.load_plink_or_bolt_file(
        path_or_buf=_INPUT_TSV.value, missing_value=_MISSING_VALUE.value)

    # Only override the epoch count when it was explicitly requested.
    model_params = (model_lib.ModelParameters()
                    if _NUM_EPOCHS.value is None else
                    model_lib.ModelParameters(num_epochs=_NUM_EPOCHS.value))

    logging.info('Training DeepNull model on %s with model %s', _TARGET.value,
                 model_params)
    final_df, _, eval_metrics, _ = train_eval.create_deepnull_prediction(
        input_df=input_df,
        target=_TARGET.value,
        target_is_binary=_TARGET.value in binary_col_map,
        covariates=_COVARIATES.value,
        prediction_column=_PREDS_COL.value,
        num_folds=_NUM_FOLDS.value,
        model_params=model_params,
        seed=_SEED.value,
        logdir=_LOGDIR.value,
        # Level 2 is printing once per epoch during training.
        verbosity=2 if _VERBOSE.value else 0,
    )

    if not metrics.acceptable_model_performance(eval_metrics):
        logging.warning(
            'WARNING: data folds have substantially different performance. Consider'
            ' retraining model with a different seed.')

    logging.info('Writing trained results to %s', _OUTPUT_TSV.value)
    data.write_plink_or_bolt_file(input_df=final_df,
                                  path_or_buf=_OUTPUT_TSV.value,
                                  binary_column_mapping=binary_col_map,
                                  missing_value=_MISSING_VALUE.value,
                                  cast_ints=True)
Example 4
    def test_train_binary_deepnull_model(self, target_is_binary,
                                         expected_metrics):
        """Trains a tiny model and verifies the returned artifact types."""
        num_samples = 1000
        df = _create_df(target_is_binary=target_is_binary, size=num_samples)
        # 80/20 split into training and evaluation data.
        train_df, eval_df = df.iloc[:800], df.iloc[800:]

        params = model_lib.ModelParameters(batch_size=32,
                                           mlp_units=[32, 16],
                                           num_epochs=2)
        with tempfile.TemporaryDirectory() as logdir:
            model, history, eval_performance = train_eval.train_deepnull_model(
                train_df=train_df,
                eval_df=eval_df,
                target='label',
                target_is_binary=target_is_binary,
                covariates=['cov1', 'cov2'],
                model_params=params,
                logdir=logdir,
                verbosity=0,
            )

        self.assertIsInstance(model, tf.keras.models.Model)
        self.assertIsInstance(history, tf.keras.callbacks.History)
        # Binary vs quantitative targets report different metric sets.
        self.assertCountEqual(eval_performance.keys(), expected_metrics)
Example 5
def train_deepnull_model(
    train_df: pd.DataFrame,
    eval_df: pd.DataFrame,
    target: str,
    target_is_binary: bool,
    covariates: List[str],
    model_params: Optional[model_lib.ModelParameters] = None,
    logdir: str = '/tmp',
    fold_ix: int = 0,
    verbosity: int = 1,
) -> Tuple[tf.keras.models.Model, tf.keras.callbacks.History, Dict[
        str, float]]:
    """Returns the best trained DeepNull model.

  This function handles running DeepNull training over the `train_df` and model
  checkpoint selection based on performance within the `eval_df`. Both
  dataframes are required to have all of the following columns, with no empty
  values:
    * `target` - The target value to predict.
    * `covariates` - The list of covariates used to predict `target`.

  Args:
    train_df: The input data used for model training.
    eval_df: The data used for model evaluation / optimal checkpoint selection.
    target: The target value to predict.
    target_is_binary: If True, models as a binary outcome. Otherwise models a
      quantitative outcome.
    covariates: The list of covariates used to predict the target.
    model_params: Parameters of the model to use.
    logdir: The log directory in which to write checkpoints.
    fold_ix: The fold index of the eval dataset. Only needed when training
      multiple folds of data so that logging parameters and checkpoints do not
      overwrite each other.
    verbosity: Level of verbosity when fitting the model. 0=silent, 1=progress,
      2=print once per epoch.

  Returns:
    A triplet of outputs: The first is the trained DeepNull model, with the
    weights of the best checkpoint (as selected on `eval_df`) loaded. The
    second is the `tf.keras.callbacks.History` from fitting the model to the
    train and eval data. The third is a dict of metrics computed on the
    evaluation data using the best checkpoint's weights.
  """
    if model_params is None:
        model_params = model_lib.ModelParameters()

    # Training input repeats forever; epoch boundaries are imposed below via
    # `steps_per_epoch`.
    train_input_fn = _make_input_fn(
        features_df=train_df[covariates],
        labels=train_df[target],
        num_epochs=None,  # Loop indefinitely.
        is_training=True,
        batch_size=model_params.batch_size)
    eval_input_fn = _make_input_fn(features_df=eval_df[covariates],
                                   labels=eval_df[target],
                                   num_epochs=1,
                                   is_training=False,
                                   batch_size=model_params.batch_size)
    # All covariates are fed as float32 numeric features.
    feature_columns = [
        tf.feature_column.numeric_column(covariate_name, dtype=tf.float32)
        for covariate_name in covariates
    ]

    # Binary and quantitative targets use different model classes and
    # different optimization metrics.
    if target_is_binary:
        model_cls = model_lib.BinaryDeepNull
        optimization_metric = metrics.get_optimization_metric('crossentropy')
    else:
        model_cls = model_lib.QuantitativeDeepNull
        optimization_metric = metrics.get_optimization_metric('tf_pearson')
    deepnull_model = model_cls(feature_columns=feature_columns,
                               mlp_units=model_params.mlp_units,
                               mlp_activation=model_params.mlp_activation,
                               optimization_metric=optimization_metric)

    deepnull_model.compile(loss=deepnull_model.loss_function(),
                           optimizer=tf.keras.optimizers.Adam(
                               learning_rate=model_params.learning_rate,
                               beta_1=model_params.beta_1,
                               beta_2=model_params.beta_2,
                           ),
                           metrics=deepnull_model.metrics_to_use())

    # Checkpoint path is fold-specific so concurrent/sequential folds do not
    # overwrite each other's best weights.
    best_checkpoint_path = os.path.join(logdir, 'ckpts',
                                        f'best-{fold_ix}.ckpt')
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_checkpoint_path,
            save_weights_only=True,
            monitor=deepnull_model.best_checkpoint_metric(),
            mode=deepnull_model.best_checkpoint_mode(),
            save_freq='epoch',
            save_best_only=True),
        tf.keras.callbacks.TensorBoard(
            log_dir=os.path.join(logdir, f'tb{fold_ix}')),
    ]

    # Because the training input repeats indefinitely, define one epoch as a
    # full pass over `train_df`.
    steps_per_epoch = math.ceil(len(train_df) / model_params.batch_size)
    history = deepnull_model.fit(train_input_fn,
                                 validation_data=eval_input_fn,
                                 epochs=model_params.num_epochs,
                                 steps_per_epoch=steps_per_epoch,
                                 verbose=verbosity,
                                 callbacks=callbacks)
    # Load the best model weights back into the model.
    deepnull_model.load_weights(best_checkpoint_path)
    best_ckpt_validation = deepnull_model.evaluate(
        eval_input_fn,
        batch_size=model_params.batch_size,
        # Evaluate only supports silent or progress bar.
        verbose=min(verbosity, 1),
        return_dict=True)
    return deepnull_model, history, best_ckpt_validation
Example 6
def create_deepnull_prediction(
    input_df: pd.DataFrame,
    target: str,
    target_is_binary: bool,
    covariates: List[str],
    prediction_column: Optional[str] = None,
    num_folds: int = 5,
    model_params: Optional[model_lib.ModelParameters] = None,
    seed: int = 318753108,  # From /dev/urandom.
    logdir: str = '/tmp',
    verbosity: int = 2,
) -> Tuple[pd.DataFrame, List[tf.keras.callbacks.History], List[Dict[
        str, float]], pd.DataFrame]:
    """Runs the entire DeepNull algorithm to add the prediction to the input data.

  This is the main entrypoint for training DeepNull. It handles splitting the
  input DataFrame into folds, training a separate DeepNull model on each fold,
  and concatenating all results into a final DataFrame that contains the
  predictions.

  Args:
    input_df: A dataframe representing the input data, as loaded by
      `load_plink_or_bolt_file` above.
    target:  The target value to predict using DeepNull.
    target_is_binary: True if and only if the target should be predicted as a
      binary value.
    covariates: The set of covariate values used to predict the target.
    prediction_column: The name of the output column to add to the dataframe.
      Defaults to `f'{target}_deepnull'` when unset.
    num_folds: The number of folds to split the data into. `num_folds` - 2 folds
      of data are used to train each DeepNull model, with evaluation of the best
      model occurring in one fold, and final predictions occurring in the final
      fold.
    model_params: Model parameters to use in training.
    seed: The random seed used to split data into training folds.
    logdir: The directory in which to write intermediate data.
    verbosity: Level of verbosity when fitting each constituent DeepNull model.
      0=silent, 1=progress, 2=print once per epoch.

  Returns:
    A tuple of four items:
      1. A pd.DataFrame with all input data plus a single new column,
         `prediction_column`, that includes the DeepNull prediction of the
         phenotype in rows for which it could be computed.
      2. A list containing `num_folds` histories, each holding the history of
         training the DeepNull model associated with that fold of data.
      3. A list containing `num_folds` dictionaries, each holding validation
         data performance metrics for each of the validation folds of data.
      4. A pd.DataFrame containing the true target value, DeepNull prediction,
         and data fold from which the test prediction was made.

  Raises:
    ValueError: The input dataframe is not able to be used to run DeepNull.
  """
    if prediction_column is None:
        prediction_column = f'{target}_deepnull'
    deepnull_fold_col = f'{target}_{_DEEPNULL_FOLD_COL_SUFFIX}'

    # Validate that the columns we are about to add do not already exist.
    if prediction_column in input_df.columns:
        raise ValueError(
            f'Prediction column "{prediction_column}" present in input.')
    if deepnull_fold_col in input_df.columns:
        raise ValueError(
            f'Reserved column "{deepnull_fold_col}" present in input.')

    if model_params is None:
        # Use the default settings for all parameters.
        model_params = model_lib.ModelParameters()
    # `exist_ok=True` avoids the race between an existence check and creation.
    os.makedirs(logdir, exist_ok=True)

    # Sanity checks. Note that we assume the inputs have already been checked for
    # presence and uniqueness of IID, and missing data are represented as np.nan.
    if num_folds < 3:
        raise ValueError(f'Must specify at least 3 data folds: {num_folds}')
    if target not in input_df.columns:
        raise ValueError(f'Target {target} absent from df: {input_df.columns}')
    # Hoist the column set so membership tests are O(1) per covariate.
    input_columns = set(input_df.columns)
    if any(cov not in input_columns for cov in covariates):
        raise ValueError(
            f'One or more requested covariates ({covariates}) absent '
            f'from df: {input_df.columns}')
    if target in covariates:
        raise ValueError(
            f'Target value {target} cannot be present in covariates: {covariates}'
        )
    if len(covariates) != len(set(covariates)):
        raise ValueError(f'Duplicate covariates encountered: {covariates}')

    # Identify IIDs where we cannot predict because missingness exists in our
    # predictors or target.
    fields = ['IID', target] + covariates
    missing_mask = pd.isnull(input_df[fields]).any(axis=1)
    iids_with_missing = input_df.IID[missing_mask]

    trainable_df = input_df.loc[~input_df.IID.isin(iids_with_missing), fields]
    all_predictions = []
    histories = []
    validation_data_performance = []
    for fold, (train_df, eval_df, test_df) in enumerate(
            data.split_data_in_folds(trainable_df,
                                     num_folds=num_folds,
                                     seed=seed)):
        print(f'Beginning training for fold {fold} of {num_folds}...')
        best_model, history, val_perf = train_deepnull_model(
            train_df=train_df,
            eval_df=eval_df,
            target=target,
            target_is_binary=target_is_binary,
            covariates=covariates,
            model_params=model_params,
            logdir=logdir,
            fold_ix=fold,
            verbosity=verbosity,
        )
        # Each fold's model predicts only on its own held-out test fold.
        fold_predictions = predict(deepnull_model=best_model,
                                   df=test_df,
                                   covariates=covariates,
                                   prediction_column=prediction_column)
        fold_predictions[deepnull_fold_col] = fold
        all_predictions.append(fold_predictions)
        histories.append(history)
        validation_data_performance.append(val_perf)

    preds_df = pd.concat(all_predictions, ignore_index=True)
    # Each trainable individual receives exactly one prediction.
    assert len(preds_df) == len(set(preds_df.IID))
    assert set(preds_df.IID) == set(trainable_df.IID)
    # Extract a dataframe that contains the true target value, the DeepNull
    # prediction, and the data fold from which the prediction came, for downstream
    # analysis.
    test_performance_df = pd.merge(preds_df,
                                   input_df[['IID', target]],
                                   on='IID',
                                   how='inner')
    # Return as the output to write a dataframe that contains all the rows of the
    # input DataFrame with just the prediction column added for samples where it
    # could be computed. A left merge keeps rows with missing data (NaN preds).
    final_df = pd.merge(input_df,
                        preds_df[['IID', prediction_column]],
                        on='IID',
                        how='left')
    return final_df, histories, validation_data_performance, test_performance_df