def create_scaffold_split(dset_key, res_dir):
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "False",
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }

    pparams = parse.wrapper(params)
    MP = mp.ModelPipeline(pparams)

    split_uuid = MP.split_dataset()
    return split_uuid
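The snippets on this page assume AMPL's parameter parser and model pipeline modules are imported as parse and mp; a minimal sketch of the assumed imports and a call to the helper above, with placeholder paths (the dataset is assumed to have the compound_id, base_rdkit_smiles and active columns named in the parameter dictionary):

# Assumed imports for these examples (AMPL's usual module layout); adjust if your installation differs.
from atomsci.ddm.pipeline import parameter_parser as parse
from atomsci.ddm.pipeline import model_pipeline as mp

# Hypothetical paths -- substitute a real curated dataset CSV and a writable results directory.
split_uuid = create_scaffold_split(dset_key='my_dataset_curated.csv', res_dir='results/scaffold_split')
print('Created scaffold split:', split_uuid)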
Example #2
def train_model(input, output):
    """ Retrain a model saved in a model_metadata.json file

    Args:
        input (str): path to model_metadata.json file

        output (str): path to output directory

    Returns:
        the ModelPipeline object with the trained model
    """
    # Train model
    # -----------
    # Read parameter JSON file
    with open(input) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
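A minimal usage sketch for the retraining helper above, assuming a saved model_metadata.json and an output directory (both paths are hypothetical):

# Hypothetical paths -- point these at a real model_metadata.json and a writable output directory.
retrained = train_model(input='saved_model/model_metadata.json', output='retrained_model')
print('New model UUID:', retrained.params.model_uuid)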
def featurize_from_shortlist(shortlist_path=None, split_json=None):
    """
    Featurize and split the ChEMBL hERG pIC50 dataset. Then create a config
    file for running a hyperparameter search to model this dataset.
    """
    sl = pd.read_csv(shortlist_path)
    with open(split_json, "r") as f:
        hp_params = json.load(f)

    print('Featurizing shortlist')
    # Drop the shortlist-specific keys so each dataset is featurized individually
    hp_params.pop('use_shortlist', None)
    hp_params.pop('shortlist_key', None)

    for i, row in sl.iterrows():
        hp_params['dataset_key'] = row.dataset_key
        hp_params['response_cols'] = row.response_cols
        pparams = parse.wrapper(hp_params)

        print('-----------------------------------------------')
        print(hp_params['dataset_key'])
        print(pparams.dataset_key)
        print('-----------------------------------------------')

        # Create a ModelPipeline object
        pipe = mp.ModelPipeline(pparams)

        # Featurize and split the dataset
        split_uuid = pipe.split_dataset()

        # Delete the split file (written alongside the dataset) to keep things clean
        dkey = row.dataset_key.replace('.csv', '')
        os.remove(f'{dkey}_train_valid_test_scaffold_{split_uuid}.csv')
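A hedged usage sketch for the shortlist featurizer above; the shortlist CSV is assumed to contain dataset_key and response_cols columns (as the loop requires), and the JSON file is assumed to hold the hyperparameter-search parameters (splitting options, result_dir, etc.):

# Hypothetical file names -- substitute your own shortlist CSV and parameter JSON.
featurize_from_shortlist(shortlist_path='herg_shortlist.csv', split_json='herg_split_params.json')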
Example #4
def train_model_from_tracker(model_uuid, output_dir):
    """ Retrain a model saved in the model tracker, but save it to output_dir and don't insert it into the model tracker

    Args:
        model_uuid (str): model_uuid of the model in the model tracker

        output_dir (str): path to output directory

    Returns:
        the model pipeline object with trained model
    """

    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    collection_name = mt.get_model_collection_by_uuid(model_uuid,
                                                      mlmt_client=mlmt_client)

    # get metadata from tracker
    config = mt.get_metadata_by_uuid(model_uuid)

    # check if datastore dataset
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception:
        # Dataset not available in the datastore; use the filesystem dataset_key as-is
        pass

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # specify collection
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
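If the model tracker is available in your environment, the helper above can be called with the UUID of a tracked model; a sketch with placeholder values:

# Hypothetical UUID and output directory -- substitute a real tracked model UUID.
pipe = train_model_from_tracker(model_uuid='00000000-0000-0000-0000-000000000000', output_dir='retrained_from_tracker')
if pipe is not None:
    print('New model UUID:', pipe.params.model_uuid)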
Example #5
def split(pparams):
    split_params = copy.copy(pparams)
    split_params.split_only = True
    split_params.previously_split = False

    model_pipeline = mp.ModelPipeline(split_params)
    # Note: each call creates a new split; reuse the returned split_uuid instead of re-splitting
    split_uuid = model_pipeline.split_dataset()

    return split_uuid
Example #6
def delaney_pipeline(y=["measured log solubility in mols per litre"],
                     featurizer="ecfp",
                     split_strategy="train_valid_test",
                     splitter="random"):
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    mp = MP.ModelPipeline(inp_params)
    return mp
Example #7
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # Locate the AMPL package directory relative to the parameter_parser module
    script_dir = os.path.dirname(os.path.dirname(parse.__file__))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir,
                                          params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print(f"Train a RF models with ECFP")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10],
                                         contains_responses=True,
                                         AD_method="z_score")

    assert (
        "AD_index"
        in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(
        model_path=pl.params.model_tarball_path,
        input_df=train_df[:10],
        id_col="compound_id",
        smiles_col="rdkit_smiles",
        response_col="pKi_mean",
        dont_standardize=True,
        AD_method="z_score")
    assert ("AD_index" in pred_df_file.columns.values
            ), 'Error: No AD_index column in pred_df_file'
def train_model_w_balan(dset_key, split_uuid, res_dir):
    # Now train models on the same dataset with balancing weights
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "True",
        "split_uuid": split_uuid,
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "weight_transform_type": "balancing",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }

    for i in range(nreps):
        pparams = parse.wrapper(params)
        MP = mp.ModelPipeline(pparams)
        MP.train_model()
        wrapper = MP.model_wrapper

        for ss in ['valid', 'test']:
            metvals = wrapper.get_pred_results(ss, 'best')
            for metric in [
                    'roc_auc_score', 'prc_auc_score', 'cross_entropy',
                    'precision', 'recall_score', 'npv', 'accuracy_score',
                    'bal_accuracy', 'kappa', 'matthews_cc'
            ]:
                subset.append(ss)
                balanced.append('yes')
                metrics.append(metric)
                vals.append(metvals[metric])
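The loop above relies on nreps and on the subset, balanced, metrics and vals accumulator lists, which the original script defines at module level; a minimal sketch of the assumed set-up and a hypothetical call, reusing the split_uuid returned by create_scaffold_split above:

# Assumed module-level state for train_model_w_balan (defined elsewhere in the original script).
nreps = 3
subset, balanced, metrics, vals = [], [], [], []

# Hypothetical inputs: a curated dataset and a writable results directory.
train_model_w_balan('my_dataset_curated.csv', split_uuid, 'results/balanced')

# Collect the accumulated metrics into a table for comparison (pandas assumed imported as pd).
perf_df = pd.DataFrame({'subset': subset, 'balanced': balanced, 'metric': metrics, 'value': vals})
print(perf_df.head())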
Example #9
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_hybrid.json", "r") as f:
        hp_params = json.load(f)

    # Locate the AMPL package directory relative to the parameter_parser module
    script_dir = os.path.dirname(os.path.dirname(parse.__file__))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir,
                                          params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print(f"Train a hybrid models with MOE descriptors")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Check the model performance on validation data")
    pred_data = pl.model_wrapper.get_perf_data(subset="valid",
                                               epoch_label="best")
    pred_results = pred_data.get_prediction_results()
    print(pred_results)

    pred_score = pred_results['r2_score']
    score_threshold = 0.4
    assert pred_score > score_threshold, \
        f'Error: Score is too low {pred_score}. Must be higher than {score_threshold}'

    print("Make predictions with the hyrid model")
    predict = pl.predict_on_dataframe(train_df[:10], contains_responses=False)
    assert (predict['pred'].shape[0] == 10
            ), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(
        predict['pred'].values))), 'Error: Predictions are not numbers'
Example #10
def train_and_get_tar(input_json, ds_key_file):

    script_path = os.path.dirname(os.path.realpath(__file__))
    json_file = os.path.join(script_path, input_json)

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, ds_key_file)
    pparams.result_dir = os.path.join(script_path, 'result')

    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()

    list_of_files = glob.glob(os.path.join(pparams.result_dir, '*.gz'))  # all model tarballs (*.gz) written to result_dir
    latest_file = max(list_of_files, key=os.path.getctime)  # most recently created tarball

    return latest_file
Example #11
def train_model(input, output, dskey=''):
    """ Retrain a model saved in a model_metadata.json file

    Args:
        input (str): path to model_metadata.json file

        output (str): path to output directory

        dskey (str): new dataset key if file location has changed

    Returns:
        the ModelPipeline object with the trained model
    """
    # Train model
    # -----------
    # Read parameter JSON file
    with open(input) as f:
        config = json.loads(f.read())

    # set a new dataset key if necessary
    if dskey != '':
        config['dataset_key'] = dskey

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    # Log the parsed parameters
    logger.debug("model params %s" % str(params))
    logger.debug(params.__dict__.items())

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
Example #12
def train(pparams):
    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()

    return train_pipe
Example #13
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    integrative_utilities.clean_fit_predict()
    clean()

    # Download
    # --------
    download()

    # Curate
    # ------
    curate()

    # Train model
    # -----------
    # Read parameter JSON file
    with open('config_delaney_train_NN.json') as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    uuid = integrative_utilities.get_subdirectory(
        'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression'
    )
    reload_dir = 'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression/' + uuid

    # Check training statistics
    # -------------------------
    integrative_utilities.training_statistics_file(reload_dir, 'test', 0.6)

    # Make prediction parameters
    # --------------------------
    # Read prediction parameter JSON file
    with open('config_delaney_predict_NN.json', 'r') as f:
        predict_parameters_dict = json.loads(f.read())

    # Set transformer key here because model uuid is not known before fit
    predict_parameters_dict['transformer_key'] = os.path.join(
        reload_dir, 'transformers.pkl')

    predict_parameters = parse.wrapper(predict_parameters_dict)

    # Load second test set
    # --------------------
    data = pd.read_csv('delaney-processed_curated_external.csv')

    # Select columns and rename response column
    data = data[[
        predict_parameters.id_col, predict_parameters.smiles_col,
        predict_parameters.response_cols[0]
    ]]
    data = data.rename(
        columns={predict_parameters.response_cols[0]: 'experimental_values'})

    # Make prediction pipeline
    # ------------------------
    pp = mp.create_prediction_pipeline_from_file(predict_parameters,
                                                 reload_dir)

    # Predict
    # -------
    predict = pp.predict_on_dataframe(data)

    # Check predictions
    # -----------------
    assert (predict['pred'].shape[0] == 117
            ), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(
        predict['pred'].values))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data,
                        predict,
                        on=predict_parameters.id_col,
                        how='inner')
    combined.to_csv('delaney-processed_curated_predict.csv')
    assert (os.path.isfile('delaney-processed_curated_predict.csv')
            and os.path.getsize('delaney-processed_curated_predict.csv') > 0
            ), 'Error: Prediction file not created'
Example #14
def base_feature_importance(model_pipeline=None, params=None):
    """
    Minimal baseline feature importance function. Given an AMPL model (or the parameters to train a model),
    returns a data frame with a row for each feature. The columns of the data frame depend on the model type and
    prediction type. If the model is a binary classifier, the columns include t-statistics and p-values
    for the differences between the means of the active and inactive compounds. If the model is a random forest,
    the columns will include the mean decrease in impurity (MDI) of each feature, computed by the scikit-learn
    feature_importances_ function. See the scikit-learn documentation for warnings about interpreting the MDI
    importance. For all models, the returned data frame will include feature names, means and standard deviations
    for each feature.

    This function has been tested on RFs and NNs with rdkit descriptors. Other models and feature combinations
    may not be supported.

    Args:
        model_pipeline (`ModelPipeline`): A pipeline object for a model that was trained in the current Python session
        or loaded from the model tracker or a tarball file. Either model_pipeline or params must be provided.

        params (`dict`): Parameter dictionary for a model to be trained and analyzed. Either model_pipeline or a
        params argument must be passed; if both are passed, params is ignored and the parameters from model_pipeline
        are used.

    Returns:
        (imp_df, model_pipeline, pparams) (tuple):
            imp_df (`DataFrame`): Table of feature importance metrics.
            model_pipeline (`ModelPipeline`): Pipeline object for model that was passed to or trained by function.
            pparams (`Namespace`): Parsed parameters of model.

    """
    log = logging.getLogger('ATOM')
    if model_pipeline is None:
        if params is None:
            raise ValueError(
                "Either model_pipeline or params must be provided; both cannot be None")
        # Train a model based on the parameters given
        pparams = parse.wrapper(params)
        model_pipeline = mp.ModelPipeline(pparams)
        model_pipeline.train_model()
    else:
        if params is not None:
            log.info(
                "model_pipeline and params were both passed; ignoring params argument and using params from model"
            )
        pparams = model_pipeline.params

    # Get the list of feature column names
    features = model_pipeline.featurization.get_feature_columns()
    nfeat = len(features)
    imp_df = pd.DataFrame({'feature': features})

    # Get the training, validation and test sets (we assume we're not using K-fold CV). These are DeepChem Dataset objects.
    (train_dset, valid_dset) = model_pipeline.data.train_valid_dsets[0]
    test_dset = model_pipeline.data.test_dset

    imp_df['mean_value'] = train_dset.X.mean(axis=0)
    imp_df['std_value'] = train_dset.X.std(axis=0)

    if pparams.prediction_type == 'classification':
        # Compute a t-statistic for each feature for the difference between its mean values for active and inactive compounds
        tstats = []
        pvalues = []
        active = train_dset.X[train_dset.y[:, 0] == 1, :]
        inactive = train_dset.X[train_dset.y[:, 0] == 0, :]

        log.debug("Computing t-statistics")
        for ifeat in range(nfeat):
            res = stats.ttest_ind(active[:, ifeat],
                                  inactive[:, ifeat],
                                  equal_var=True,
                                  nan_policy='omit')
            tstats.append(res.statistic)
            pvalues.append(res.pvalue)
        imp_df['t_statistic'] = tstats
        imp_df['ttest_pvalue'] = pvalues

    if pparams.model_type == 'RF':
        # Tabulate the MDI-based feature importances for random forest models
        # TODO: Does this work for XGBoost models too?
        rf_model = model_pipeline.model_wrapper.model.model
        imp_df['mdi_importance'] = rf_model.feature_importances_

    return imp_df, model_pipeline, pparams
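A hedged usage sketch, assuming model is a ModelPipeline trained as in the retraining examples above:

# Use a previously trained pipeline; params is ignored when model_pipeline is supplied.
imp_df, pipe, pparams = base_feature_importance(model_pipeline=model, params=None)
# For RF models, rank features by MDI importance; otherwise inspect means and t-statistics directly.
if pparams.model_type == 'RF':
    imp_df = imp_df.sort_values('mdi_importance', ascending=False)
print(imp_df.head(10))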
Example #15
def test_train_NN_graphconv_scaffold_inputs():
    """

    Args:
    pipeline (ModelPipeline): The ModelPipeline instance for this model run.
    
    Dependencies:
    ModelPipeline creation
    featurization creation
    creation of model_wrapper
    mp.load_featurize_data

    Calls:
    create_perf_data
    perf_data.accumulate_preds
    perf_data.compute_perf_metrics
    data.combined_training_data()
    self._copy_model
    """
    # checking that the layers, dropouts, and learning rate are properly added to the deepchem graphconv model
    general_params['featurizer'] = 'graphconv'
    general_params['layer_sizes'] = '100,100,10'
    general_params['dropouts'] = '0.3,0.3,0.1'
    general_params['uncertainty'] = False
    inp_params = parse.wrapper(general_params)
    mp = MP.ModelPipeline(inp_params)
    mp.featurization = feat.create_featurization(inp_params)
    mp.model_wrapper = model_wrapper.create_model_wrapper(
        inp_params, mp.featurization, mp.ds_client)
    # asserting that the correct model is created with the correct layer sizes, dropouts, model_dir, and mode by default
    test1 = []

    test1.append(mp.model_wrapper.params.layer_sizes == [100, 100, 10])
    test1.append(mp.model_wrapper.params.dropouts == [0.3, 0.3, 0.1])
    # checking that parameters are properly passed to the deepchem model object
    test1.append(isinstance(mp.model_wrapper.model, GraphConvModel))
    test1.append(
        mp.model_wrapper.model.model_dir == mp.model_wrapper.model_dir)
    test1.append(
        [i.out_channel
         for i in mp.model_wrapper.model.model.graph_convs] == [100, 100])
    test1.append(
        [i.rate
         for i in mp.model_wrapper.model.model.dropouts] == [0.3, 0.3, 0.1])
    test1.append(mp.model_wrapper.model.mode == 'regression')
    test1.append(mp.model_wrapper.model.model.dense.units == 10)
    assert all(test1)

    #***********************************************************************************
    def test_super_get_train_valid_pred_results():
        """
        Args:
        perf_data: A PerfData object that stores the predicted values and metrics
        Returns:
        dict: A dictionary of the prediction results

        Raises:
        None

        Dependencies:
        create_perf_data

        Calls:
        perf_data.get_prediction_results()

        """
        pass

    # should be tested in perf_data.get_prediction_results()
    # should still be called to make sure that the function is callable

    #***********************************************************************************
    def test_super_get_test_perf_data():
        """
        Args:
        model_dir (str): Directory where the saved model is stored
        model_dataset (DiskDataset): Stores the current dataset and related methods

        Returns:
        perf_data: PerfData object containing the predicted values and metrics for the current test dataset

        Raises:
        None

        Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

        Calls:
        create_perf_data
        self.generate_predictions
        perf_data.accumulate_preds
        """
        pass
        # mostly tested in accumulate_preds, but should be tested to ensure that the predictions are properly generated

    #***********************************************************************************
    def test_super_get_test_pred_results():
        """
        Args:
        model_dir (str): Directory where the saved model is stored
        model_dataset (DiskDataset): Stores the current dataset and related methods

        Returns:
        dict: A dictionary containing the prediction values and metrics for the current dataset.

        Raises:
        None

        Dependencies:
        A model must be in model_dir
        model_dataset.test_dset must exist

        Calls:
        self.get_test_perf_data
        perf_data.get_prediction_results
        """
        pass
        # mostly tested in perf_data.get_prediction_results

    #***********************************************************************************
    def test_super_get_full_dataset_perf_data():
        """
        Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods

        Returns:
        perf_data: PerfData object containing the predicted values and metrics for the current full dataset

        Raises:
        None

        Dependencies:
        A model must already be trained

        Calls:
        create_perf_data
        self.generate_predictions
        self.accumulate_preds
        """
        pass

    #***********************************************************************************
    def test_super_get_full_dataset_pred_results():
        """
        Args:
        model_dataset (DiskDataset): Stores the current dataset and related methods
        Returns:
        dict: A dictionary containing predicted values and metrics for the current full dataset

        Raises:
        None

        Dependencies:
        A model must already be trained.

        Calls:
        get_full_dataset_perf_data
        self.get_prediction_results()
        """
        pass
def train_and_predict(train_json_f, prefix='delaney-processed'):
    # Train model
    # -----------
    # Read parameter JSON file
    with open(train_json_f) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    model_type = params.model_type
    prediction_type = params.prediction_type
    descriptor_type = params.descriptor_type
    featurizer = params.featurizer
    splitter = params.splitter
    model_dir = 'result/%s_curated_fit/%s_%s_%s_%s' % (
        prefix, model_type, featurizer, splitter, prediction_type)
    uuid = model.params.model_uuid
    tar_f = 'result/%s_curated_fit_model_%s.tar.gz' % (prefix, uuid)
    reload_dir = model_dir + '/' + uuid

    # Check training statistics
    # -------------------------
    if prediction_type == 'regression':
        threshold = 0.6
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])

        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold, 'r2_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'r2_score')
    else:
        threshold = 0.7
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold,
                                                       'accuracy_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'accuracy_score')

    print("Final test score:", score)

    # Load second test set
    # --------------------
    data = pd.read_csv('%s_curated_external.csv' % prefix)

    predict = pfm.predict_from_model_file(tar_f,
                                          data,
                                          id_col=params.id_col,
                                          smiles_col=params.smiles_col,
                                          response_col=params.response_cols)
    pred_cols = [f for f in predict.columns if f.endswith('_pred')]

    pred = predict[pred_cols].to_numpy()

    # Check predictions
    # -----------------
    assert (
        pred.shape[0] == len(data)), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(pred))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data, predict, on=params.id_col, how='inner')
    pred_csv_name = '%s_curated_%s_%s_%s_%s_%d_%s_predict.csv' % (
        prefix, model_type, prediction_type, descriptor_type, featurizer,
        len(model.params.response_cols), model.params.splitter)
    combined.to_csv(pred_csv_name)
    assert (os.path.isfile(pred_csv_name)
            and os.path.getsize(pred_csv_name) > 0
            ), 'Error: Prediction file not created'

    return tar_f
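A usage sketch tying the helper above to the Delaney configuration used earlier on this page; the JSON file name matches the one read in Example #13 and is assumed to sit in the working directory alongside the curated CSV files:

# Train, evaluate and predict with the Delaney NN configuration, keeping the model tarball path.
tar_path = train_and_predict('config_delaney_train_NN.json', prefix='delaney-processed')
print('Model tarball:', tar_path)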