Code Example #1
    def __init__(self, **kwargs):

        self.configs = dict()
        X = kwargs.pop('X')
        y = kwargs.pop('y', None)

        trbefile = kwargs.pop('trbefile', None)
        if trbefile is not None:
            batch_effects_train = fileio.load(trbefile)
        else:
            batch_effects_train = np.zeros([X.shape[0], 2])
        self.configs['batch_effects_train'] = batch_effects_train

        tsbefile = kwargs.pop('tsbefile', None)
        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = None
        self.configs['batch_effects_test'] = batch_effects_test

        self.configs['type'] = kwargs.pop('model_type', 'linear')
        self.configs['random_intercept'] = kwargs.pop('random_intercept',
                                                      'True') == 'True'
        self.configs['random_slope'] = kwargs.pop('random_slope',
                                                  'True') == 'True'
        self.configs['random_noise'] = kwargs.pop('random_noise',
                                                  'True') == 'True'
        self.configs['hetero_noise'] = kwargs.pop('hetero_noise',
                                                  'False') == 'True'
        self.configs['noise_model'] = kwargs.pop('noise_model', 'linear')
        self.configs['nn_hidden_neuron_num'] = int(
            kwargs.pop('nn_hidden_neuron_num', '2'))
        self.configs['new_site'] = kwargs.pop('new_site', 'False') == 'True'
        self.configs['newsite_training_idx'] = kwargs.pop(
            'newsite_training_idx', None)
        self.configs['pred_type'] = kwargs.pop('pred_type', 'single')

        if y is not None:
            self.hbr = HBR(np.squeeze(X), np.squeeze(batch_effects_train[:,
                                                                         0]),
                           np.squeeze(batch_effects_train[:, 1]),
                           np.squeeze(y), self.configs)
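
A minimal usage sketch for the constructor above. The class name NormHBR, the import path and the file names are assumptions made for illustration; only the keyword names (X, y, trbefile, model_type and the string-valued flags) come from the code itself.

import numpy as np
from nispat.norm_hbr import NormHBR   # assumed import path and class name

X = np.random.randn(100, 1)           # covariates, e.g. age
y = np.random.randn(100)              # response variable
model = NormHBR(X=X, y=y,
                trbefile='batch_effects_train.pkl',  # placeholder path
                model_type='linear',
                random_intercept='True',   # boolean flags arrive as strings
                random_slope='True',
                random_noise='True',
                hetero_noise='False',
                pred_type='single')
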
Code Example #2
def load_response_vars(datafile, maskfile=None, vol=True):
    """ load response variables (of any data type)"""

    if fileio.file_type(datafile) == 'nifti':
        dat = fileio.load_nifti(datafile, vol=vol)
        volmask = fileio.create_mask(dat, mask=maskfile)
        Y = fileio.vol2vec(dat, volmask).T
    else:
        Y = fileio.load(datafile)
        volmask = None
        if fileio.file_type(datafile) == 'cifti':
            Y = Y.T

    return Y, volmask
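
load_response_vars dispatches on the file type: nifti volumes are masked and vectorised, cifti matrices are transposed, and everything else is loaded as-is. A hedged usage sketch; the file names are placeholders and the function is assumed to be in scope (it lives in normative.py):

# Placeholder file names
Y, maskvol = load_response_vars('responses.nii.gz', maskfile='mask.nii.gz')
print(Y.shape)                  # subjects x voxels after vol2vec and transpose

Y_txt, maskvol_txt = load_response_vars('responses.txt')
assert maskvol_txt is None      # no mask volume for non-nifti input
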
Code Example #3
File: norm_hbr.py  Project: Hesterhuijsdens/nispat
    def estimate(self, X, y, **kwargs):

        trbefile = kwargs.pop('trbefile', None)
        if trbefile is not None:
            batch_effects_train = fileio.load(trbefile)
        else:
            print(
                'Could not find batch-effects file! Initializing all as zeros ...'
            )
            batch_effects_train = np.zeros([X.shape[0], 1])

        self.hbr.estimate(X, y, batch_effects_train)

        return self
Code Example #4
File: norm_hbr.py  Project: Hesterhuijsdens/nispat
    def predict(self, Xs, X=None, Y=None, **kwargs):

        tsbefile = kwargs.pop('tsbefile', None)
        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            print(
                'Could not find batch-effects file! Initializing all as zeros ...'
            )
            batch_effects_test = np.zeros([Xs.shape[0], 1])

        pred_type = self.configs['pred_type']

        yhat, s2 = self.hbr.predict(Xs, batch_effects_test, pred=pred_type)

        return yhat.squeeze(), s2.squeeze()
Code Example #5
    def predict(self, Xs, X=None, Y=None, **kwargs):

        tsbefile = kwargs.pop('tsbefile', None)
        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = np.zeros([Xs.shape[0], 2])
        self.configs['batch_effects_test'] = batch_effects_test

        pred_type = self.configs['pred_type']

        yhat, s2 = self.hbr.predict(np.squeeze(Xs),
                                    np.squeeze(batch_effects_test[:, 0]),
                                    np.squeeze(batch_effects_test[:, 1]),
                                    pred=pred_type)
        return yhat, s2
Code Example #6
def rerun_nm(processing_dir, log_path, memory, duration, binary=False):
    """
    This function reruns all failed batches in processing_dir after collect_nm
    has identified the failed batches

    * Input:
        * processing_dir        -> Full path to the processing directory
        * memory                -> Memory requirements written as string
                                   for example 4gb or 500mb
        * duration               -> The approximate duration of the job, a
                                   string with HH:MM:SS for example 01:01:01

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    if binary:
        file_extentions = '.pkl'
        failed_batches = fileio.load(processing_dir + 'failed_batches' +
                                     file_extentions)
        shape = failed_batches.shape
        for n in range(0, shape[0]):
            jobpath = failed_batches[n, 0]
            print(jobpath)
            qsub_nm(job_path=jobpath,
                    log_path=log_path,
                    memory=memory,
                    duration=duration)
    else:
        file_extentions = '.txt'
        failed_batches = fileio.load_pd(processing_dir + 'failed_batches' +
                                        file_extentions)
        shape = failed_batches.shape
        for n in range(0, shape[0]):
            jobpath = failed_batches.iloc[n, 0]
            print(jobpath)
            qsub_nm(job_path=jobpath,
                    log_path=log_path,
                    memory=memory,
                    duration=duration)
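
The docstring fixes the expected formats for memory ('4gb', '500mb') and duration ('HH:MM:SS'), and the code builds paths by plain string concatenation, so processing_dir should end with a separator. A hedged call sketch; the import path and all directories are assumptions:

from nispat.normative_parallel import rerun_nm   # assumed import path

rerun_nm(processing_dir='/data/study/processing/',   # trailing slash matters
         log_path='/data/study/logs/',
         memory='4gb',               # memory format from the docstring
         duration='01:01:01',        # HH:MM:SS, as documented
         binary=False)               # reads failed_batches.txt rather than .pkl
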
Code Example #7
def estimate(respfile,
             covfile,
             maskfile=None,
             cvfolds=None,
             testcov=None,
             testresp=None,
             alg='gpr',
             configparam=None,
             saveoutput=True,
             outputsuffix=None):
    """ Estimate a normative model

    This will estimate a model in one of two settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
        required settings 1) respfile 2) covfile 3) cvfolds>=2
    * estimating a training dataset then applying to a second test dataset
        required settings 1) respfile 2) covfile 3) testcov 4) testresp
    * estimating on a training dataset, outputting the forward maps (mean and se)
        required settings 1) respfile 2) covfile 3) testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(respfile, covfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * Hyp - hyperparameters
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)
    """

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            sub_te = Xte.shape[0]
            Yte = np.zeros([sub_te, Nmod])

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]), ), (testids, ))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

        # force the number of cross-validation folds to 1
        if cvfolds is not None and cvfolds != 1:
            print("Ignoring cross-valdation specification (test data given)")
        cvfolds = 1
    else:
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(
        np.bitwise_and(np.isfinite(Y).any(axis=0),
                       np.var(Y, axis=0) != 0))[0]

    # Initialise normative model
    nm = norm_init(X, alg=alg, configparam=configparam)

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Hyp = np.zeros((Nmod, nm.n_params, cvfolds))

    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    for idx in enumerate(splits.split(X)):
        fold = idx[0]
        tr = idx[1][0]
        te = idx[1][1]

        # standardize responses and covariates, ignoring invalid entries
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X[tr, :], axis=0)
        sX = np.std(X[tr, :], axis=0)
        Xz = (X - mX) / sX

        # estimate the models for all subjects
        for i in range(0, len(nz)):
            print("Estimating model ", i + 1, "of", len(nz))
            try:
                nm = norm_init(Xz[tr, :],
                               Yz[tr, nz[i]],
                               alg=alg,
                               configparam=configparam)
                Hyp[nz[i], :, fold] = nm.estimate(Xz[tr, :], Yz[tr, nz[i]])
                yhat, s2 = nm.predict(Xz[tr, :], Yz[tr, nz[i]], Xz[te, :],
                                      Hyp[nz[i], :, fold])

                Yhat[te, nz[i]] = yhat * sY[i] + mY[i]
                S2[te, nz[i]] = s2 * sY[i]**2
                nlZ[nz[i], fold] = nm.neg_log_lik
                if testcov is None:
                    Z[te, nz[i]] = (Y[te, nz[i]] - Yhat[te, nz[i]]) / \
                                   np.sqrt(S2[te, nz[i]])
                else:
                    if testresp is not None:
                        Z[te, nz[i]] = (Y[te, nz[i]] - Yhat[te, nz[i]]) / \
                                       np.sqrt(S2[te, nz[i]])

            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i + 1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)
                Hyp[nz[i], :, fold] = float('nan')

                Yhat[te, nz[i]] = float('nan')
                S2[te, nz[i]] = float('nan')
                nlZ[nz[i], fold] = float('nan')
                if testcov is None:
                    Z[te, nz[i]] = float('nan')
                else:
                    if testresp is not None:
                        Z[te, nz[i]] = float('nan')

    # compute performance metrics
    if testcov is None:
        MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
        RMSE = np.sqrt(MSE)
        # for the remaining variables, we need to ignore zero variances
        SMSE = np.zeros_like(MSE)
        Rho = np.zeros(Nmod)
        pRho = np.ones(Nmod)
        iy, jy = np.ix_(testids, nz)  # ids for tested samples nonzero values
        SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
        Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])
    else:
        if testresp is not None:
            MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
            RMSE = np.sqrt(MSE)
            # for the remaining variables, we need to ignore zero variances
            SMSE = np.zeros_like(MSE)
            Rho = np.zeros(Nmod)
            pRho = np.ones(Nmod)
            iy, jy = np.ix_(testids, nz)  # ids tested samples nonzero values
            SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
            Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])

    # Set writing options
    if saveoutput:
        print("Writing output ...")
        if fileio.file_type(respfile) == 'cifti' or \
           fileio.file_type(respfile) == 'nifti':
            exfile = respfile
        else:
            exfile = None
        if outputsuffix is not None:
            ext = str(outputsuffix) + fileio.file_extension(respfile)
        else:
            ext = fileio.file_extension(respfile)

        # Write output
        if testcov is None:
            fileio.save(Yhat[testids, :].T,
                        'yhat' + ext,
                        example=exfile,
                        mask=maskvol)
            fileio.save(S2[testids, :].T,
                        'ys2' + ext,
                        example=exfile,
                        mask=maskvol)
            fileio.save(Z[testids, :].T,
                        'Z' + ext,
                        example=exfile,
                        mask=maskvol)
            fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
            fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
            fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
            fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
            if cvfolds is None:
                fileio.save(Hyp[:, :, 0],
                            'Hyp' + ext,
                            example=exfile,
                            mask=maskvol)
            else:
                for idx in enumerate(splits.split(X)):
                    fold = idx[0]
                    fileio.save(Hyp[:, :, fold],
                                'Hyp_' + str(fold + 1) + ext,
                                example=exfile,
                                mask=maskvol)
        else:
            if testresp is None:
                fileio.save(Yhat[testids, :].T,
                            'yhat' + ext,
                            example=exfile,
                            mask=maskvol)
                fileio.save(S2[testids, :].T,
                            'ys2' + ext,
                            example=exfile,
                            mask=maskvol)
                fileio.save(Hyp[:, :, 0],
                            'Hyp' + ext,
                            example=exfile,
                            mask=maskvol)
            else:
                fileio.save(Yhat[testids, :].T,
                            'yhat' + ext,
                            example=exfile,
                            mask=maskvol)
                fileio.save(S2[testids, :].T,
                            'ys2' + ext,
                            example=exfile,
                            mask=maskvol)
                fileio.save(Z[testids, :].T,
                            'Z' + ext,
                            example=exfile,
                            mask=maskvol)
                fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
                fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
                fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
                fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
                if cvfolds is None:
                    fileio.save(Hyp[:, :, 0],
                                'Hyp' + ext,
                                example=exfile,
                                mask=maskvol)
                else:
                    for idx in enumerate(splits.split(X)):
                        fold = idx[0]
                        fileio.save(Hyp[:, :, fold],
                                    'Hyp_' + str(fold + 1) + ext,
                                    example=exfile,
                                    mask=maskvol)
    else:
        if testcov is None:
            output = (Yhat[testids, :], S2[testids, :], Hyp, Z[testids, :],
                      Rho, pRho, RMSE, SMSE)
        else:
            if testresp is None:
                output = (Yhat[testids, :], S2[testids, :], Hyp[testids, :])
            else:
                output = (Yhat[testids, :], S2[testids, :], Hyp, Z[testids, :],
                          Rho, pRho, RMSE, SMSE)
        return output
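
Following the docstring, the two supported settings translate into calls like the sketch below. File names are placeholders and the import path is an assumption; only the parameter names come from the signature above.

from nispat.normative import estimate   # assumed import path

# 1) k-fold cross-validation on a single dataset
estimate('responses.txt', 'covariates.txt', cvfolds=5, alg='gpr',
         saveoutput=True)

# 2) estimate on a training set, then apply to a held-out test set
estimate('responses.txt', 'covariates.txt',
         testcov='test_covariates.txt', testresp='test_responses.txt',
         alg='gpr', outputsuffix='_split1')
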
Code Example #8
File: normative_parallel.py  Project: lindenmp/nispat
def collect_nm(processing_dir, collect=False, binary=False):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir        -> Full path to the processing directory
        * collect               -> If True, data is checked for failed batches
                                and collected; if False, data is just checked

    ** Output:
        * Text files containing all results across all batches, i.e. the
          combined output

    written by (primarily) T Wolfers, (adapted) SM Kia
    """
    # import of necessary modules
    import os
    import sys
    import glob
    import numpy as np
    import pandas as pd
    try:
        import nispat.fileio as fileio
    except ImportError:
        path = os.path.abspath(os.path.dirname(__file__))
        if path not in sys.path:
            sys.path.append(path)
            del path
        import fileio

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    file_example = glob.glob(processing_dir + 'batch_1/' + 'yhat' +
                             file_extentions)
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    all_Hyptxt = glob.glob(processing_dir + 'batch_*/' + 'Hyp*')
    if all_Hyptxt != []:
        first_Hyptxt = fileio.load(all_Hyptxt[0])
        first_Hyptxt = first_Hyptxt.transpose()
        nHyp = len(first_Hyptxt)
        dir_first_Hyptxt = os.path.dirname(all_Hyptxt[0])
        all_crossval = glob.glob(dir_first_Hyptxt + '/' + 'Hyp*')
        n_crossval = len(all_crossval)

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    for batch in glob.glob(processing_dir + 'batch_*/'):
        filepath = glob.glob(batch + 'yhat*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho, batch + 'pRho' + file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho, batch + 'Rho' + file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse, batch + 'rmse' + file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse, batch + 'smse' + file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv, batch + 'expv' + file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll, batch + 'msll' + file_extentions)

                yhat = np.zeros([batch_size, numsubjects])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat, batch + 'yhat' + file_extentions)

                ys2 = np.zeros([batch_size, numsubjects])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2, batch + 'ys2' + file_extentions)

                Z = np.zeros([batch_size, numsubjects])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + file_extentions)

                for n in range(1, n_crossval + 1):
                    hyp = np.zeros([batch_size, nHyp])
                    hyp = pd.DataFrame(hyp)
                    fileio.save(hyp, batch + 'Hyp_' + str(n) + file_extentions)
        else:  # if more than 10% of yhat is nan then consider the batch as a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed:' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df,
                       processing_dir + 'failed_batches' + file_extentions)
    else:
        fileio.save(batch_fail_df,
                    processing_dir + 'failed_batches' + file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_combined = pd.concat(pRho_dfs, ignore_index=True)
            fileio.save(pRho_combined,
                        processing_dir + 'pRho' + file_extentions)

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_combined = pd.concat(Rho_dfs, ignore_index=True)
            fileio.save(Rho_combined, processing_dir + 'Rho' + file_extentions)

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_combined = pd.concat(Z_dfs, ignore_index=True)
            fileio.save(Z_combined, processing_dir + 'Z' + file_extentions)

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_combined = pd.concat(yhat_dfs, ignore_index=True)
            fileio.save(yhat_combined,
                        processing_dir + 'yhat' + file_extentions)

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_combined = pd.concat(ys2_dfs, ignore_index=True)
            fileio.save(ys2_combined, processing_dir + 'ys2' + file_extentions)

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'rmse*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_combined = pd.concat(rmse_dfs, ignore_index=True)
            fileio.save(rmse_combined,
                        processing_dir + 'rmse' + file_extentions)

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'smse*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_combined = pd.concat(smse_dfs, ignore_index=True)
            fileio.save(smse_combined,
                        processing_dir + 'smse' + file_extentions)

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'expv*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_combined = pd.concat(expv_dfs, ignore_index=True)
            fileio.save(expv_combined,
                        processing_dir + 'expv' + file_extentions)

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'msll*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_combined = pd.concat(msll_dfs, ignore_index=True)
            fileio.save(msll_combined,
                        processing_dir + 'msll' + file_extentions)

        for n in range(1, n_crossval + 1):
            Hyp_filenames = glob.glob(processing_dir + 'batch_*/' + 'Hyp_' +
                                      str(n) + '.*')
            if Hyp_filenames:
                Hyp_filenames = fileio.sort_nicely(Hyp_filenames)
                Hyp_dfs = []
                for Hyp_filename in Hyp_filenames:
                    Hyp_dfs.append(pd.DataFrame(fileio.load(Hyp_filename)))
                Hyp_combined = pd.concat(Hyp_dfs, ignore_index=True)
                fileio.save(Hyp_combined,
                            processing_dir + 'Hyp_' + str(n) + file_extentions)
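
With collect=False the function only records missing or mostly-NaN yhat files in failed_batches; with collect=True it also writes placeholder outputs for the failed batches and concatenates all results across batches. A hedged sketch; the directory is a placeholder and the import path an assumption:

from nispat.normative_parallel import collect_nm   # assumed import path

processing_dir = '/data/study/processing/'   # trailing separator matters;
                                              # paths are built by concatenation
collect_nm(processing_dir, collect=False, binary=False)   # check only
collect_nm(processing_dir, collect=True, binary=False)    # check and combine
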
Code Example #9
File: normative.py  Project: zpopov/nispat
def transfer(covfile,
             respfile,
             testcov=None,
             testresp=None,
             maskfile=None,
             **kwargs):

    if (not 'model_path' in list(kwargs.keys())) or \
        (not 'output_path' in list(kwargs.keys())) or \
        (not 'trbefile' in list(kwargs.keys())):
        return
    else:
        model_path = kwargs.pop('model_path')
        output_path = kwargs.pop('output_path')
        trbefile = kwargs.pop('trbefile')

    outputsuffix = kwargs.pop('outputsuffix', None)
    tsbefile = kwargs.pop('tsbefile', None)

    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    transferred_models_path = os.path.join(output_path, 'Models')
    if not os.path.isdir(transferred_models_path):
        os.mkdir(transferred_models_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    feature_num = Y.shape[1]
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)

    if trbefile is not None:
        batch_effects_train = fileio.load(trbefile)
    else:
        batch_effects_train = np.zeros([X.shape[0], 2])

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        ts_sample_num = Xte.shape[0]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            Yte = np.zeros([ts_sample_num, feature_num])

        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = np.zeros([Xte.shape[0], 2])

    Yhat = np.zeros([ts_sample_num, feature_num])
    S2 = np.zeros([ts_sample_num, feature_num])
    Z = np.zeros([ts_sample_num, feature_num])

    # estimate the models for all subjects
    for i in range(feature_num):

        nm = norm_init(X)
        if batch_size is not None:  # when using normative_parallel
            print("Transferring model ", job_id * batch_size + i)
            nm = nm.load(
                os.path.join(model_path,
                             'NM_0_' + str(job_id * batch_size + i) + '.pkl'))
        else:
            print("Transferting model ", i + 1, "of", feature_num)
            nm = nm.load(os.path.join(model_path, 'NM_0_' + str(i) + '.pkl'))

        nm = nm.estimate_on_new_sites(X, Y[:, i], batch_effects_train)
        if batch_size is not None:
            nm.save(
                os.path.join(
                    transferred_models_path,
                    'NM_transfered_' + str(job_id * batch_size + i) + '.pkl'))
        else:
            nm.save(
                os.path.join(transferred_models_path,
                             'NM_transfered_' + str(i) + '.pkl'))

        if testcov is not None:
            yhat, s2 = nm.predict_on_new_sites(Xte, batch_effects_test)
            Yhat[:, i] = yhat
            S2[:, i] = s2

    if testresp is None:
        save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
        return (Yhat, S2)
    else:
        Z = (Yte - Yhat) / np.sqrt(S2)

        print("Evaluating the model ...")
        results = evaluate(Yte, Yhat, S2=S2, mY=mY, sY=sY)

        save_results(respfile,
                     Yhat,
                     S2,
                     maskvol,
                     Z=Z,
                     results=results,
                     outputsuffix=outputsuffix)

        return (Yhat, S2, Z)
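
transfer returns immediately unless model_path, output_path and trbefile are all passed as keyword arguments. A hedged call sketch; every file name below is a placeholder and the import path is an assumption:

from nispat.normative import transfer   # assumed import path

Yhat, S2, Z = transfer('covariates.txt', 'responses.txt',
                       testcov='test_covariates.txt',
                       testresp='test_responses.txt',
                       model_path='Models',                 # required kwarg
                       output_path='Transfer',              # required kwarg
                       trbefile='batch_effects_train.pkl',  # required kwarg
                       tsbefile='batch_effects_test.pkl')
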
Code Example #10
File: normative.py  Project: zpopov/nispat
def predict(covfile, respfile=None, maskfile=None, **kwargs):

    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    output_path = kwargs.pop('output_path', '')
    outputsuffix = kwargs.pop('outputsuffix', None)

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return
    else:
        with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
            meta_data = pickle.load(file)
        standardize = meta_data['standardize']
        mY = meta_data['mean_resp']
        sY = meta_data['std_resp']
        mX = meta_data['mean_cov']
        sX = meta_data['std_cov']

    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    if (output_path != '') and (not os.path.isdir(output_path)):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    sample_num = X.shape[0]
    feature_num = len(glob.glob(os.path.join(model_path, 'NM_*.pkl')))

    # run cross-validation loop
    Yhat = np.zeros([sample_num, feature_num])
    S2 = np.zeros([sample_num, feature_num])
    Z = np.zeros([sample_num, feature_num])

    if standardize:
        Xz = (X - mX[0]) / sX[0]
    else:
        Xz = X

    # estimate the models for all subjects
    for i in range(feature_num):
        print("Prediction by model ", i + 1, "of", feature_num)
        nm = norm_init(Xz)
        nm = nm.load(
            os.path.join(model_path, 'NM_' + str(0) + '_' + str(i) + '.pkl'))
        yhat, s2 = nm.predict(Xz, **kwargs)

        if standardize:
            Yhat[:, i] = yhat * sY[0][i] + mY[0][i]
            S2[:, i] = s2 * sY[0][i]**2
        else:
            Yhat[:, i] = yhat
            S2[:, i] = s2

    if respfile is None:
        return (Yhat, S2)

    else:
        Y, maskvol = load_response_vars(respfile, maskfile)
        if len(Y.shape) == 1:
            Y = Y[:, np.newaxis]

        Z = (Y - Yhat) / np.sqrt(S2)

        print("Evaluating the model ...")
        results = evaluate(Y,
                           Yhat,
                           S2=S2,
                           metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

        print("Evaluations Writing outputs ...")
        save_results(respfile,
                     Yhat,
                     S2,
                     maskvol,
                     Z=Z,
                     outputsuffix=outputsuffix,
                     results=results,
                     save_path=output_path)

        return (Yhat, S2, Z)
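
predict reloads the per-feature models (NM_0_<i>.pkl) together with the meta_data.md written at estimation time, so the caller mainly supplies the covariates and the model directory. A hedged sketch with placeholder file names; the import path is an assumption:

from nispat.normative import predict   # assumed import path

# without a response file only predictions are returned
Yhat, S2 = predict('test_covariates.txt', model_path='Models')

# with responses, deviance scores are computed and results are written out
Yhat, S2, Z = predict('test_covariates.txt', respfile='test_responses.txt',
                      model_path='Models', outputsuffix='_predict')
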
Code Example #11
File: normative.py  Project: zpopov/nispat
def estimate(covfile, respfile, **kwargs):
    """ Estimate a normative model

    This will estimate a model in one of two settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
        required settings 1) respfile 2) covfile 3) cvfolds>=2
    * estimating a training dataset then applying to a second test dataset
        required settings 1) respfile 2) covfile 3) testcov 4) testresp
    * estimating on a training dataset, outputting the forward maps (mean and se)
        required settings 1) respfile 2) covfile 3) testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(respfile, covfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * nm - normative model
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)
    """

    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    cvfolds = kwargs.pop('cvfolds', None)
    testcov = kwargs.pop('testcov', None)
    testresp = kwargs.pop('testresp', None)
    alg = kwargs.pop('alg', 'gpr')
    saveoutput = kwargs.pop('saveoutput', 'True') == 'True'
    savemodel = kwargs.pop('savemodel', 'False') == 'True'
    outputsuffix = kwargs.pop('outputsuffix', None)
    standardize = kwargs.pop('standardize', True)

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:  # we have a separate test dataset

        run_cv = False
        cvfolds = 1
        Xte = fileio.load(testcov)
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            sub_te = Xte.shape[0]
            Yte = np.zeros([sub_te, Nmod])

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]), ), (testids, ))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

    else:
        run_cv = True
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(
        np.bitwise_and(np.isfinite(Y).any(axis=0),
                       np.var(Y, axis=0) != 0))[0]

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    for idx in enumerate(splits.split(X)):

        fold = idx[0]
        tr = idx[1][0]
        te = idx[1][1]

        # standardize responses and covariates, ignoring invalid entries
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        mean_resp.append(mY)
        std_resp.append(sY)
        if standardize:
            Yz = np.zeros_like(Y)
            Yz[:, nz] = (Y[:, nz] - mY) / sY
            mX = np.mean(X[tr, :], axis=0)
            sX = np.std(X[tr, :], axis=0)
            Xz = (X - mX) / sX
            mean_cov.append(mX)
            std_cov.append(sX)
        else:
            Yz = Y
            Xz = X

        # estimate the models for all subjects
        for i in range(0, len(nz)):
            print("Estimating model ", i + 1, "of", len(nz))
            nm = norm_init(Xz[tr, :], Yz[tr, nz[i]], alg=alg, **kwargs)
            try:
                nm = nm.estimate(Xz[tr, :], Yz[tr, nz[i]])

                yhat, s2 = nm.predict(Xz[te, :], Xz[tr, :], Yz[tr, nz[i]],
                                      **kwargs)

                if savemodel:
                    nm.save('Models/NM_' + str(fold) + '_' + str(nz[i]) +
                            '.pkl')

                if standardize:
                    Yhat[te, nz[i]] = yhat * sY[i] + mY[i]
                    S2[te, nz[i]] = s2 * sY[i]**2
                else:
                    Yhat[te, nz[i]] = yhat
                    S2[te, nz[i]] = s2

                nlZ[nz[i], fold] = nm.neg_log_lik
                if (run_cv or testresp is not None):
                    Z[te, nz[i]] = (Y[te, nz[i]] - Yhat[te, nz[i]]) / \
                                   np.sqrt(S2[te, nz[i]])

            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i + 1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)

                Yhat[te, nz[i]] = float('nan')
                S2[te, nz[i]] = float('nan')
                nlZ[nz[i], fold] = float('nan')
                if testcov is None:
                    Z[te, nz[i]] = float('nan')
                else:
                    if testresp is not None:
                        Z[te, nz[i]] = float('nan')
    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump(
                {
                    'valid_voxels': nz,
                    'fold_num': cvfolds,
                    'mean_resp': mean_resp,
                    'std_resp': std_resp,
                    'mean_cov': mean_cov,
                    'std_cov': std_cov,
                    'regressor': alg,
                    'standardize': standardize
                }, file)

    # compute performance metrics
    if (run_cv or testresp is not None):
        print("Evaluating the model ...")
        results = evaluate(Y[testids, :],
                           Yhat[testids, :],
                           S2=S2[testids, :],
                           mY=mean_resp[0],
                           sY=std_resp[0])

    # Set writing options
    if saveoutput:
        if (run_cv or testresp is not None):
            save_results(respfile,
                         Yhat[testids, :],
                         S2[testids, :],
                         maskvol,
                         Z=Z[testids, :],
                         results=results,
                         outputsuffix=outputsuffix)

        else:
            save_results(respfile,
                         Yhat[testids, :],
                         S2[testids, :],
                         maskvol,
                         outputsuffix=outputsuffix)

    else:
        if (run_cv or testresp is not None):
            output = (Yhat[testids, :], S2[testids, :], nm, Z[testids, :],
                      results)
        else:
            output = (Yhat[testids, :], S2[testids, :], nm)

        return output
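
In this kwargs-based variant the positional order is (covfile, respfile) and the switches saveoutput and savemodel arrive as the strings 'True'/'False' (see the pops at the top), which matters when calling it directly. A hedged sketch with placeholder file names; the import path is an assumption:

from nispat.normative import estimate   # assumed import path

estimate('covariates.txt', 'responses.txt',
         cvfolds=5, alg='gpr',
         savemodel='True',        # compared against the string 'True'
         saveoutput='True',
         standardize=True)        # standardize is an ordinary boolean
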
Code Example #12
def extend(covfile, respfile, maskfile=None, **kwargs):
    
    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model extension is only possible for HBR models.')
        return
    elif (not 'model_path' in list(kwargs.keys())) or \
        (not 'output_path' in list(kwargs.keys())) or \
        (not 'trbefile' in list(kwargs.keys())) or \
        (not 'dummycovfile' in list(kwargs.keys()))or \
        (not 'dummybefile' in list(kwargs.keys())):
            print('InputError: Some mandatory arguments are missing.')
            return
    else:
        model_path = kwargs.pop('model_path')
        output_path = kwargs.pop('output_path')
        trbefile = kwargs.pop('trbefile')
        dummycovfile = kwargs.pop('dummycovfile')
        dummybefile = kwargs.pop('dummybefile')
    
    informative_prior = kwargs.pop('informative_prior', 'False') == 'True'
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1
    
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
            
    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    batch_effects_train = fileio.load(trbefile)
    X_dummy = fileio.load(dummycovfile)
    batch_effects_dummy = fileio.load(dummybefile)
    
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    if len(X_dummy.shape) == 1:
        X_dummy = X_dummy[:, np.newaxis]
    feature_num = Y.shape[1]
    
    # estimate the models for all subjects
    for i in range(feature_num):
              
        nm = norm_init(X)
        if batch_size is not None: # when using normative_parallel
            print("Extending model ", job_id*batch_size+i)
            nm = nm.load(os.path.join(model_path, 'NM_0_' + 
                                      str(job_id*batch_size+i) + '.pkl'))
        else:
            print("Extending model ", i+1, "of", feature_num)
            nm = nm.load(os.path.join(model_path, 'NM_0_' + str(i) + '.pkl'))
        
        nm = nm.extend(X, Y[:,i:i+1], batch_effects_train, X_dummy, batch_effects_dummy, 
               samples=generation_factor, informative_prior=informative_prior)
        
        if batch_size is not None: 
            nm.save(os.path.join(output_path, 'NM_0_' + 
                             str(job_id*batch_size+i) + '.pkl'))
        else:
            nm.save(os.path.join(output_path, 'NM_0_' + 
                             str(i) + '.pkl'))
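
extend only proceeds for alg='hbr' and requires model_path, output_path, trbefile, dummycovfile and dummybefile. A minimal hedged call; all paths are placeholders and the import path is an assumption:

from nispat.normative import extend   # assumed import path

extend('covariates.txt', 'responses.txt',
       alg='hbr',                               # anything else returns early
       model_path='Models',
       output_path='Models_extended',
       trbefile='batch_effects_train.pkl',
       dummycovfile='dummy_covariates.txt',
       dummybefile='dummy_batch_effects.pkl',
       informative_prior='False',               # string flag
       generation_factor='10')
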
Code Example #13
def fit(covfile, respfile, **kwargs):
    
    # parse keyword arguments 
    maskfile = kwargs.pop('maskfile',None)
    alg = kwargs.pop('alg','gpr')
    savemodel = kwargs.pop('savemodel','True')=='True'
    standardize = kwargs.pop('standardize',True)
    
    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    
    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    # standardize responses and covariates, ignoring invalid entries
    mY = np.mean(Y[:, nz], axis=0)
    sY = np.std(Y[:, nz], axis=0)
    mean_resp.append(mY)
    std_resp.append(sY)
    if standardize:
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X, axis=0)
        sX = np.std(X,  axis=0)
        Xz = (X - mX) / sX
        mean_resp.append(mY)
        std_resp.append(sY)
        mean_cov.append(mX)
        std_cov.append(sX)
    else:
        Yz = Y
        Xz = X

    # estimate the models for all subjects
    for i in range(0, len(nz)):  
        print("Estimating model ", i+1, "of", len(nz))
        nm = norm_init(Xz, Yz[:, nz[i]], alg=alg, **kwargs)
        nm = nm.estimate(Xz, Yz[:, nz[i]], **kwargs)     
            
        if savemodel:
            nm.save('Models/NM_' + str(0) + '_' + str(nz[i]) + '.pkl' )

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels':nz,
                         'mean_resp':mean_resp, 'std_resp':std_resp, 
                         'mean_cov':mean_cov, 'std_cov':std_cov, 
                         'regressor':alg, 'standardize':standardize}, file)
        
    return nm
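
fit estimates one model per valid response variable on the full dataset (no cross-validation) and, with savemodel='True', pickles each model under Models/. A hedged sketch with placeholder file names; the import path is an assumption:

from nispat.normative import fit   # assumed import path

nm = fit('covariates.txt', 'responses.txt',
         alg='gpr',
         savemodel='True',     # string flag, as popped above
         standardize=True)
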
Code Example #14
def collect_nm(processing_dir,
               job_name,
               func='estimate',
               collect=False,
               binary=False,
               batch_size=None,
               outputsuffix=''):
    """This function checks and collects all batches.

    ** Input:
        * processing_dir        -> Full path to the processing directory
        * collect               -> If True, data is checked for failed batches
                                and collected; if False, data is just checked

    ** Output:
        * Text files containing all results across all batches, i.e. the
          combined output

    written by (primarily) T Wolfers, (adapted) SM Kia
    """

    if binary:
        file_extentions = '.pkl'
    else:
        file_extentions = '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    batches = glob.glob(processing_dir + 'batch_*/')
    file_example = []
    for batch in batches:
        if file_example == []:
            file_example = glob.glob(batch + 'yhat' + outputsuffix +
                                     file_extentions)
        else:
            break
    if binary is False:
        file_example = fileio.load(file_example[0])
    else:
        file_example = pd.read_pickle(file_example[0])
    numsubjects = file_example.shape[0]
    batch_size = file_example.shape[1]

    # artificially creates files for batches that were not executed
    count = 0
    batch_fail = []
    batch_dirs = glob.glob(processing_dir + 'batch_*/')
    batch_dirs = fileio.sort_nicely(batch_dirs)
    for batch in batch_dirs:
        filepath = glob.glob(batch + 'yhat' + outputsuffix + '*')
        if filepath == []:
            count = count + 1
            batch1 = glob.glob(batch + '/' + job_name + '*.sh')
            print(batch1)
            batch_fail.append(batch1)
            if collect is True:
                pRho = np.ones(batch_size)
                pRho = pRho.transpose()
                pRho = pd.Series(pRho)
                fileio.save(pRho,
                            batch + 'pRho' + outputsuffix + file_extentions)

                Rho = np.zeros(batch_size)
                Rho = Rho.transpose()
                Rho = pd.Series(Rho)
                fileio.save(Rho,
                            batch + 'Rho' + outputsuffix + file_extentions)

                rmse = np.zeros(batch_size)
                rmse = rmse.transpose()
                rmse = pd.Series(rmse)
                fileio.save(rmse,
                            batch + 'RMSE' + outputsuffix + file_extentions)

                smse = np.zeros(batch_size)
                smse = smse.transpose()
                smse = pd.Series(smse)
                fileio.save(smse,
                            batch + 'SMSE' + outputsuffix + file_extentions)

                expv = np.zeros(batch_size)
                expv = expv.transpose()
                expv = pd.Series(expv)
                fileio.save(expv,
                            batch + 'EXPV' + outputsuffix + file_extentions)

                msll = np.zeros(batch_size)
                msll = msll.transpose()
                msll = pd.Series(msll)
                fileio.save(msll,
                            batch + 'MSLL' + outputsuffix + file_extentions)

                yhat = np.zeros([numsubjects, batch_size])
                yhat = pd.DataFrame(yhat)
                fileio.save(yhat,
                            batch + 'yhat' + outputsuffix + file_extentions)

                ys2 = np.zeros([numsubjects, batch_size])
                ys2 = pd.DataFrame(ys2)
                fileio.save(ys2,
                            batch + 'ys2' + outputsuffix + file_extentions)

                Z = np.zeros([numsubjects, batch_size])
                Z = pd.DataFrame(Z)
                fileio.save(Z, batch + 'Z' + outputsuffix + file_extentions)

                if not os.path.isdir(batch + 'Models'):
                    os.mkdir(batch + 'Models')

        else:  # if more than 10% of yhat is nan then consider the batch as a failed batch
            yhat = fileio.load(filepath[0])
            if np.count_nonzero(~np.isnan(yhat)) / (np.prod(yhat.shape)) < 0.9:
                count = count + 1
                batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                print('More than 10% nans in ' + batch1[0])
                batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed:' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df,
                       processing_dir + 'failed_batches' + file_extentions)
    else:
        fileio.save(batch_fail_df,
                    processing_dir + 'failed_batches' + file_extentions)

    # combines all output files across batches
    if collect is True:
        pRho_filenames = glob.glob(processing_dir + 'batch_*/' + 'pRho' +
                                   outputsuffix + '*')
        if pRho_filenames:
            pRho_filenames = fileio.sort_nicely(pRho_filenames)
            pRho_dfs = []
            for pRho_filename in pRho_filenames:
                pRho_dfs.append(pd.DataFrame(fileio.load(pRho_filename)))
            pRho_dfs = pd.concat(pRho_dfs, ignore_index=True, axis=0)
            fileio.save(
                pRho_dfs,
                processing_dir + 'pRho' + outputsuffix + file_extentions)
            del pRho_dfs

        Rho_filenames = glob.glob(processing_dir + 'batch_*/' + 'Rho' +
                                  outputsuffix + '*')
        if Rho_filenames:
            Rho_filenames = fileio.sort_nicely(Rho_filenames)
            Rho_dfs = []
            for Rho_filename in Rho_filenames:
                Rho_dfs.append(pd.DataFrame(fileio.load(Rho_filename)))
            Rho_dfs = pd.concat(Rho_dfs, ignore_index=True, axis=0)
            fileio.save(
                Rho_dfs,
                processing_dir + 'Rho' + outputsuffix + file_extentions)
            del Rho_dfs

        Z_filenames = glob.glob(processing_dir + 'batch_*/' + 'Z' +
                                outputsuffix + '*')
        if Z_filenames:
            Z_filenames = fileio.sort_nicely(Z_filenames)
            Z_dfs = []
            for Z_filename in Z_filenames:
                Z_dfs.append(pd.DataFrame(fileio.load(Z_filename)))
            Z_dfs = pd.concat(Z_dfs, ignore_index=True, axis=1)
            fileio.save(Z_dfs,
                        processing_dir + 'Z' + outputsuffix + file_extentions)
            del Z_dfs

        yhat_filenames = glob.glob(processing_dir + 'batch_*/' + 'yhat' +
                                   outputsuffix + '*')
        if yhat_filenames:
            yhat_filenames = fileio.sort_nicely(yhat_filenames)
            yhat_dfs = []
            for yhat_filename in yhat_filenames:
                yhat_dfs.append(pd.DataFrame(fileio.load(yhat_filename)))
            yhat_dfs = pd.concat(yhat_dfs, ignore_index=True, axis=1)
            fileio.save(
                yhat_dfs,
                processing_dir + 'yhat' + outputsuffix + file_extentions)
            del yhat_dfs

        ys2_filenames = glob.glob(processing_dir + 'batch_*/' + 'ys2' +
                                  outputsuffix + '*')
        if ys2_filenames:
            ys2_filenames = fileio.sort_nicely(ys2_filenames)
            ys2_dfs = []
            for ys2_filename in ys2_filenames:
                ys2_dfs.append(pd.DataFrame(fileio.load(ys2_filename)))
            ys2_dfs = pd.concat(ys2_dfs, ignore_index=True, axis=1)
            fileio.save(
                ys2_dfs,
                processing_dir + 'ys2' + outputsuffix + file_extentions)
            del ys2_dfs

        rmse_filenames = glob.glob(processing_dir + 'batch_*/' + 'RMSE' +
                                   outputsuffix + '*')
        if rmse_filenames:
            rmse_filenames = fileio.sort_nicely(rmse_filenames)
            rmse_dfs = []
            for rmse_filename in rmse_filenames:
                rmse_dfs.append(pd.DataFrame(fileio.load(rmse_filename)))
            rmse_dfs = pd.concat(rmse_dfs, ignore_index=True, axis=0)
            fileio.save(
                rmse_dfs,
                processing_dir + 'RMSE' + outputsuffix + file_extentions)
            del rmse_dfs

        smse_filenames = glob.glob(processing_dir + 'batch_*/' + 'SMSE' +
                                   outputsuffix + '*')
        if smse_filenames:
            smse_filenames = fileio.sort_nicely(smse_filenames)
            smse_dfs = []
            for smse_filename in smse_filenames:
                smse_dfs.append(pd.DataFrame(fileio.load(smse_filename)))
            smse_dfs = pd.concat(smse_dfs, ignore_index=True, axis=0)
            fileio.save(
                smse_dfs,
                processing_dir + 'SMSE' + outputsuffix + file_extentions)
            del smse_dfs

        expv_filenames = glob.glob(processing_dir + 'batch_*/' + 'EXPV' +
                                   outputsuffix + '*')
        if expv_filenames:
            expv_filenames = fileio.sort_nicely(expv_filenames)
            expv_dfs = []
            for expv_filename in expv_filenames:
                expv_dfs.append(pd.DataFrame(fileio.load(expv_filename)))
            expv_dfs = pd.concat(expv_dfs, ignore_index=True, axis=0)
            fileio.save(
                expv_dfs,
                processing_dir + 'EXPV' + outputsuffix + file_extentions)
            del expv_dfs

        msll_filenames = glob.glob(processing_dir + 'batch_*/' + 'MSLL' +
                                   outputsuffix + '*')
        if msll_filenames:
            msll_filenames = fileio.sort_nicely(msll_filenames)
            msll_dfs = []
            for msll_filename in msll_filenames:
                msll_dfs.append(pd.DataFrame(fileio.load(msll_filename)))
            msll_dfs = pd.concat(msll_dfs, ignore_index=True, axis=0)
            fileio.save(
                msll_dfs,
                processing_dir + 'MSLL' + outputsuffix + file_extentions)
            del msll_dfs

        if func != 'predict':
            if not os.path.isdir(processing_dir + 'Models') and \
               os.path.exists(os.path.join(batches[0], 'Models')):
                os.mkdir(processing_dir + 'Models')

            meta_filenames = glob.glob(processing_dir + 'batch_*/Models/' +
                                       'meta_data.md')
            mY = []
            sY = []
            mX = []
            sX = []
            if meta_filenames:
                meta_filenames = fileio.sort_nicely(meta_filenames)
                with open(meta_filenames[0], 'rb') as file:
                    meta_data = pickle.load(file)
                if meta_data['standardize']:
                    # reload each batch's meta data so every batch contributes
                    # its own standardisation statistics
                    for meta_filename in meta_filenames:
                        with open(meta_filename, 'rb') as file:
                            meta_data = pickle.load(file)
                        mY.append(meta_data['mean_resp'])
                        sY.append(meta_data['std_resp'])
                        mX.append(meta_data['mean_cov'])
                        sX.append(meta_data['std_cov'])
                    meta_data['mean_resp'] = np.stack(mY)
                    meta_data['std_resp'] = np.stack(sY)
                    meta_data['mean_cov'] = np.stack(mX)
                    meta_data['std_cov'] = np.stack(sX)

                with open(
                        os.path.join(processing_dir, 'Models', 'meta_data.md'),
                        'wb') as file:
                    pickle.dump(meta_data, file)

            batch_dirs = glob.glob(processing_dir + 'batch_*/')
            if batch_dirs:
                batch_dirs = fileio.sort_nicely(batch_dirs)
                for b, batch_dir in enumerate(batch_dirs):
                    src_files = glob.glob(batch_dir + 'Models/*.pkl')
                    src_files = fileio.sort_nicely(src_files)
                    for f, full_file_name in enumerate(src_files):
                        if os.path.isfile(full_file_name):
                            file_name = full_file_name.split('/')[-1]
                            n = file_name.split('_')
                            n[-1] = str(b * batch_size + f) + '.pkl'
                            n = '_'.join(n)
                            shutil.copy(full_file_name,
                                        processing_dir + 'Models/' + n)

    if not batch_fail:
        return 1
    else:
        return 0
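
This later version additionally needs the job_name used when the batch scripts were submitted and an outputsuffix matching the per-batch file names, and it returns 1 when no batch failed and 0 otherwise. A hedged sketch; the directory, job name and suffix are placeholders and the import path is an assumption:

from nispat.normative_parallel import collect_nm   # assumed import path

ok = collect_nm('/data/study/processing/',   # trailing separator matters
                job_name='nm_job',
                func='estimate',
                collect=True,
                binary=True,                  # per-batch outputs stored as .pkl
                outputsuffix='_estimate')
if ok != 1:
    print('Some batches failed; see failed_batches in the processing dir.')
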