Example #1
0
    def model(self, nthin=1, store=False, verbose=False):
        '''
        Infers growth parameters of interest (including diauxic shifts) by Gaussian Process fitting of data.

        Every sample listed in self.key.index is extracted and fit on its own with a GrowthModel.

        Args:
            nthin (int): forwarded to GrowthModel as `nthin` (presumably the thinning step for
                time points -- confirm against GrowthModel).
            store (boolean): if True, the output of curve.data() for each sample is collected
                and saved as the object's `gp_data` attribute.
            verbose (boolean): forwarded to smartPrint for the per-sample progress message.

        Actions:
            modifies self.key (joins the fitted parameters, then merges the diauxie results
            on 'Sample_ID'), and creates self.gp_data if store is True.

        Returns:
            None
        '''

        # initialize variables for storing parameters and data
        data_ls, diauxie_dict = [], {}
        gp_params = initParamDf(self.key.index, 0)

        for sample_id in self.key.index:

            pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values

            smartPrint('Fitting {}\t{}'.format(pid, well), verbose)

            # extract sample
            args_dict = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict()
            sample = self.extractGrowthData(args_dict)

            # two-column frame expected by GrowthModel: time first, then measurements
            df = sample.time.join(sample.data)
            df.columns = ['Time', 'OD']

            # create GP object and analyze
            gm = GrowthModel(df=df,
                             baseline=sample.key.OD_Baseline.values,
                             ARD=False,
                             heteroscedastic=False,
                             nthin=nthin)

            curve = gm.run(name=sample_id)

            # 'df_dx' is kept apart from the other parameters; it is merged into
            # a single diauxie table after the loop
            diauxie_dict[sample_id] = curve.params.pop('df_dx')
            gp_params.loc[sample_id, :] = curve.params

            # passively save data, manipulation occurs elsewhere
            # (per the original note: input OD, GP fit, & GP derivative)
            if store:
                data_ls.append(curve.data())

        diauxie_df = mergeDiauxieDfs(diauxie_dict)

        # record results in object's key
        self.key = self.key.join(gp_params)
        self.key = pd.merge(self.key, diauxie_df, on='Sample_ID')

        # plotting needs transformed (or real) OD & GP fit, & may need GP derivative,
        # save all as object attributes
        if store:
            self.gp_data = pd.concat(data_ls).reset_index(drop=True)

        return None
Example #2
0
    def executeRegression(self):
        '''
        Computes the log Bayes Factor and its null distribution (based on permutation tests).

        Takes no arguments; all inputs are read from the object:
            self.data (pandas.DataFrame): each row is a single measurement (i.e. time point in a well),
                columns are variables and must include 'OD' and every variable named in the hypothesis.
            self.hypothesis (dictionary): keys must be 'H0' and 'H1', values are lists of variables
                (must match data keys).
            self.args.fix_noise: forwarded to GrowthModel as `heteroscedastic`.
            self.args.number_permutations (int): number of permutations to generate null distribution.
            self.args.time_step_size: forwarded to GrowthModel as `nthin`.
            self.plate.mods.logged: forwarded to GrowthModel as `logged`.
            self.verbose (boolean): forwarded to smartPrint.

        Actions:
            sets self.log_BF (LL1 - LL0; per the original documentation this is the log Bayes Factor,
                log (P(H1|D)/P(H0|D))), self.LL0, self.LL1, and self.model (the fitted H1 model).
            sets self.log_BF_null_dist to the list of permuted values, or to None if no
                permutations were run.

        Returns:
            None

        Raises:
            IndexError: if H1 contains no variable that is absent from H0.
        '''

        verbose = self.verbose
        hypothesis = self.hypothesis
        fix_noise = self.args.fix_noise
        nperm = self.args.number_permutations
        nthin = self.args.time_step_size

        data = self.data

        data0 = data.loc[:, ['OD'] + hypothesis['H0']]
        data1 = data.loc[:, ['OD'] + hypothesis['H1']]

        gm0 = GrowthModel(df=data0,
                          ARD=True,
                          heteroscedastic=fix_noise,
                          nthin=nthin,
                          logged=self.plate.mods.logged)
        gm1 = GrowthModel(df=data1,
                          ARD=True,
                          heteroscedastic=fix_noise,
                          nthin=nthin,
                          logged=self.plate.mods.logged)

        gm0, LL0 = gm0.run(predict=False)
        gm1, LL1 = gm1.run(predict=False)
        log_BF = LL1 - LL0

        self.log_BF = log_BF
        self.model = gm1
        self.LL0 = LL0
        self.LL1 = LL1
        self.log_BF_null_dist = None

        # Variable to permute: the first H1 variable that is not part of H0.
        # Walking H1 in list order (instead of indexing into a set difference)
        # keeps the choice reproducible when H1 adds more than one variable.
        null_variables = set(hypothesis['H0'])
        to_permute = [
            varb for varb in hypothesis['H1'] if varb not in null_variables
        ][0]

        null_distribution = []
        for rep in range(nperm):
            smartPrint('Permutation #{}'.format(rep), verbose)
            null_distribution.append(gm1.permute(to_permute) - LL0)
        smartPrint('', verbose)

        # leave the attribute as None when no permutations were requested
        if null_distribution:
            self.log_BF_null_dist = null_distribution
Example #3
0
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.
        While runGrowthFitting() analyzes data one plate at a time, runCombinedGrowthFitting()
        can pool experimental replicates across different plates. The downside is that data
        summary must be merged and no 96-well plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths; the keys
            'summary' and 'derived' are read here.
        args (dictionary): keys are arguments and value are user/default choices. Keys read here:
            'fout' (output file name), 'pb' (comma-separated meta-data variables used to pool
            replicates), 'slf' (if truthy, report parameters from curve.sample().posterior),
            'fn' (forwarded to GrowthModel as `heteroscedastic`), 'nthin' (forwarded to
            GrowthModel), 'sgd' (if truthy, save the GP fit data). The whole dictionary is
            also passed to normalizePooledParameters().
        verbose (boolean)

    Action:
        saves a summary text file in the summary folder.
        saves a diauxie text file in the summary folder, only if any condition has diauxie == 1.
        saves a gp_data text file in the derived folder, only if args['sgd'] is truthy.
        exits via sys.exit() if any key in args['pb'] is missing from the mapping columns.

    Returns:
        None
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(
            missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True,
                           poly=getValue('PolyFit'),
                           groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()

    # one row per unique combination of the pooling variables
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(combine_keys)
    plate_cond = plate_cond.reset_index(drop=True)

    smartPrint(
        'AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
        verbose)

    data_ls, diauxie_dict = [], {}

    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict(
        )  # e.g. {'Substate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(
            plate_key,
            cond_dict).index.values  # list of index values for N samples
        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1

        # pool replicates: stack all wells into one long (Time, OD) table
        cond_data = cond_data.melt(id_vars='Time',
                                   var_name='Sample_ID',
                                   value_name='OD')
        cond_data = cond_data.drop(
            ['Sample_ID'], axis=1)  # T*R x 2 (where R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data,
                         ARD=True,
                         heteroscedastic=fix_noise,
                         nthin=nthin)

        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time_df = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                # single noise value repeated for every time point
                sigma_noise = np.ravel([gm.noise] * time_df.shape[0])

            mu_var = pd.DataFrame(
                [mu0, var0, mu1, var1, sigma_noise],
                index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T

            # repeat the condition's meta-data on every row of its fit
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time_df).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    # growth-parameter columns that are actually present in the merged table
    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    # split into a parameter report and a diauxie report
    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary',
                                 '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie',
                                 '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    # axis must be a keyword: pandas >= 2.0 rejects the positional form drop(labels, 1)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename,
                                     'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
Example #4
0
    def savePredictions(self):
        '''
        Given model predictions of growth curves (for each unique set of conditions tested),
            describe the latent function and its derivative in terms of growth parameters.
            Reports results in text files placed in the dir_path directory.

        Takes no arguments; all inputs are read from the object:
            self.model: fitted model wrapper; its `.model` attribute is handed to GrowthModel
            self.x_full (pandas.DataFrame): prediction inputs plus 'mu', 'Sigma', 'Noise' columns
            self.x_min (pandas.DataFrame): one row per unique set of conditions
            self.factor_dict (dictionary): mapping of unique values of variables to numerical integers
            self.target: its first element is passed to prettyifyParameterReport
            self.args (dictionary): 'slf' (posterior, boolean) and 'sgd' (save_latent, boolean)
            self.paths_dict (dictionary): 'dir' (path to directory) and 'filename' (file name)

        Actions:
            saves a 'params' text file.
            saves a 'diauxie' text file, only if any row has diauxie == 1.
            saves an 'output' text file with x_full (factor codes mapped back through
                reverseDict), only if save_latent is truthy.

        Returns:
            None
        '''

        model = self.model
        factor_dict = self.factor_dict
        variable = self.target[0]
        confidence = getValue('confidence')  # confidence interval, e.g. 0.95

        posterior = self.args['slf']
        save_latent = self.args['sgd']

        dir_path = self.paths_dict['dir']
        file_name = self.paths_dict['filename']

        x_full = self.x_full
        x_min = self.x_min

        diauxie_dict = {}
        params_latent = initParamDf(x_min.index, complexity=0)
        params_sample = initParamDf(x_min.index, complexity=1)

        for idx, row in x_min.iterrows():

            # get x data for this condition only; prediction columns are not model inputs
            # (axis must be a keyword: pandas >= 2.0 rejects the positional form drop(labels, 1))
            df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1),
                          row.to_dict())

            # get curve based on model predictions
            gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
            curve = gm.run()

            # get parameter estimates using predicted curve
            diauxie_dict[idx] = curve.params.pop('df_dx')
            params_latent.loc[idx, :] = curve.params

            if posterior:
                params_sample.loc[idx, :] = curve.sample().posterior

        # summarize diauxie results
        diauxie_df = mergeDiauxieDfs(diauxie_dict)

        if posterior:
            gp_params = params_sample.join(params_latent['diauxie'])
        else:
            gp_params = params_latent

        gp_params = x_min.join(gp_params)
        gp_params.index.name = 'Sample_ID'
        gp_params = gp_params.reset_index(drop=False)
        gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

        # save gp_data fit: map numerical factor codes through the reversed mapping
        x_out = x_full.copy()
        for key, mapping in factor_dict.items():
            if key in x_out.keys():
                x_out.loc[:,
                          key] = x_out.loc[:,
                                           key].replace(reverseDict(mapping))
            if key in gp_params.keys():
                gp_params.loc[:, key] = gp_params.loc[:, key].replace(
                    reverseDict(mapping))

        # growth-parameter columns that are actually present in the merged table
        diauxie = initDiauxieList()
        params = initParamList(0) + initParamList(1)
        params = list(set(params).intersection(set(gp_params.keys())))

        # split into a parameter report and a diauxie report
        df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
        df_params = minimizeParameterReport(df_params)
        df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
        df_diauxie = minimizeDiauxieReport(df_diauxie)

        if posterior:
            df_params = prettyifyParameterReport(df_params, variable,
                                                 confidence)
            df_params = articulateParameters(df_params, axis=0)

        summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
        diux_path = assembleFullName(dir_path, '', file_name, 'diauxie',
                                     '.txt')

        # the index is written only for the posterior report
        # NOTE(review): presumably articulateParameters() puts meaningful labels in the index -- confirm
        df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
        if df_diauxie.shape[0] > 0:
            df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

        if save_latent:
            file_path = assembleFullName(dir_path, '', file_name, 'output',
                                         '.txt')
            x_out.to_csv(file_path, sep='\t', header=True, index=True)