Example #1
0
def setAxesLabels(ax, subtract_control, plot_params, fontsize=20):
    '''
    Given an axis and analysis parameters, determine appropriate labels
        for axes and adjust them accordingly.

    The y-label starts from the configured 'hypo_plot_y_label' value, is prefixed
        with 'ln ' unless plot_params['plot_linear_od'] is truthy, and is prefixed
        with 'Normalized ' when subtract_control is truthy. The x-label is
        'Time (<output time units>)'.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        subtract_control (boolean)
        plot_params (dictionary): must contain 'plot_linear_od'; may contain 'fontsize'
        fontsize (float): label font size used only if plot_params has no 'fontsize' entry

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    if plot_params['plot_linear_od']:
        ylabel = getValue('hypo_plot_y_label')
    else:
        ylabel = 'ln {}'.format(getValue('hypo_plot_y_label'))

    if subtract_control:
        ylabel = 'Normalized {}'.format(ylabel)

    # plot_params keeps precedence (unchanged behavior for existing callers);
    # the previously ignored `fontsize` argument now serves as the fallback.
    label_size = plot_params.get('fontsize', fontsize)

    ax.set_xlabel('Time ({})'.format(getTimeUnits('output')),
                  fontsize=label_size)
    ax.set_ylabel(ylabel, fontsize=label_size)

    return ax
Example #2
0
def describeVariance(df, time='X0', od='Y'):
    '''
    Estimate the variance of the measurements at each time point, smooth it over
        time with a Gaussian filter, and attach it to the data as an 'error' column.

    df columns ['X0','X1',...,'Y']
    values of Xs except for X0 should be non-unique

    Every unique time point is expected to have the same number of rows (replicates):
        rows are numbered 0..nS-1 within each time point ('SID' column) so that
        the data can be pivoted into a time-by-replicate table.

    Args:
        df (pandas.DataFrame): long-format data
        time (str): name of the time column
        od (str): name of the measurement column

    Returns:
        df (pandas.DataFrame): input rows sorted by ['SID', time], with two
            additional columns, 'SID' and 'error'
    '''

    window = getValue('variance_smoothing_window')

    # sort on the caller-specified time column; this was hard-coded to 'Time',
    # which raised KeyError for the documented default layout (time='X0')
    df = df.sort_values(time)
    df.reset_index(drop=True, inplace=True)

    nX = len(df[time].drop_duplicates())  # number of unique time points
    nS = int(df.shape[0] / nX)  # number of rows (replicates) per time point

    # after sorting by time, label rows 0..nS-1 within each time point
    sid = pd.DataFrame(np.ravel([np.arange(nS)] * nX), columns=['SID'])
    df = df.join(sid)

    tmp = pd.pivot(df, index=time, columns='SID', values=od)

    # a window below 1 is interpreted as a fraction of the number of time points
    if window < 1: window = int(np.ceil(nX * window))

    var = np.var(tmp.values, 1)  # variance across replicates at each time point
    var = filters.gaussian_filter1d(var, window)

    # order rows replicate-by-replicate so the tiled variance lines up with time
    df = df.sort_values(['SID', time])
    df.loc[:, 'error'] = np.ravel([var] * nS)

    return df
Example #3
0
    def prepRegressionPlate(self):
        '''
        Packages self.master_data and self.master_mapping into a GrowthPlate() object,
            applies a fixed sequence of transformations, and stores the result.

        The sequence is: convert time units, log-transform, subtract baseline
            (grouped by self.non_time_varbs), then subtract control (only if
            self.subtract_control is set).

        Actions:
            writes the plate's key as a tab-separated file to self.paths_dict['key']
            sets self.plate and self.ntimepoints (number of rows in plate.time)
        '''

        growth_plate = GrowthPlate(self.master_data, self.master_mapping)

        # harmonize time units with the user configuration
        units = {
            'input': getTimeUnits('input'),
            'output': getTimeUnits('output')
        }
        growth_plate.convertTimeUnits(**units)

        growth_plate.logData()

        # baseline is subtracted within groups defined by the non-time variables
        poly_fit = getValue('PolyFit')
        grouping = list(self.non_time_varbs)
        growth_plate.subtractBaseline(to_do=True,
                                      poly=poly_fit,
                                      groupby=grouping)

        growth_plate.subtractControl(to_do=self.subtract_control, drop=True)

        # save the processed key (sample meta-data) to disk
        key_path = self.paths_dict['key']
        growth_plate.key.to_csv(key_path, sep='\t', header=True, index=True)

        self.plate = growth_plate
        self.ntimepoints = growth_plate.time.shape[0]
Example #4
0
    def model(self, nthin=1, store=False, verbose=False):
        '''
        Infers growth parameters of interest (including diauxic shifts) by Gaussian Process
            fitting of data. Each sample in self.key is fit separately.

        Args:
            nthin (int): forwarded to GrowthModel() for each sample
            store (boolean): if True, the per-sample curve data are concatenated and
                stored as self.gp_data
            verbose (boolean): forwarded to smartPrint() for progress messages

        Actions:
            modifies self.key (joins the fitted parameters and merges the diauxie
                summary on 'Sample_ID'), and creates self.gp_data if store is True

        Returns:
            None
        '''

        # initialize variables for storing parameters and data
        data_ls, diauxie_dict = [], {}
        gp_params = initParamDf(self.key.index, 0)

        for sample_id in self.key.index:

            pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values

            smartPrint('Fitting {}\t{}'.format(pid, well), verbose)

            # extract sample
            args_dict = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict()
            sample = self.extractGrowthData(args_dict)

            df = sample.time.join(sample.data)
            df.columns = ['Time', 'OD']

            # create GP object and analyze
            gm = GrowthModel(df=df,
                             baseline=sample.key.OD_Baseline.values,
                             ARD=False,
                             heteroscedastic=False,
                             nthin=nthin)

            curve = gm.run(name=sample_id)

            # diauxie results are kept apart from the scalar growth parameters
            diauxie_dict[sample_id] = curve.params.pop('df_dx')
            gp_params.loc[sample_id, :] = curve.params

            # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
            if store: data_ls.append(curve.data())

        diauxie_df = mergeDiauxieDfs(diauxie_dict)

        # record results in object's key
        self.key = self.key.join(gp_params)
        self.key = pd.merge(self.key, diauxie_df, on='Sample_ID')

        # plotting needs transformed (or real) OD & GP fit, & may need GP derivative, save all as object attributes
        if store: self.gp_data = pd.concat(data_ls).reset_index(drop=True)

        return None
Example #5
0
def prepDataForFitting(data,
                       mapping,
                       subtract_baseline=True,
                       subtract_control=False,
                       subtract_blanks=False,
                       log_transform=False,
                       drop_flagged_wells=False):
    '''
    Packages data set into a growth.GrowthPlate() object and transforms data in preparation for GP fitting.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        subtract_baseline (boolean): forwarded to computeFoldChange() and subtractBaseline()
        subtract_control (boolean): whether to subtract control wells
        subtract_blanks (boolean): whether to subtract blank wells
        log_transform (boolean): whether to log-transform the data
        drop_flagged_wells (boolean): whether to drop flagged wells

    Returns:
        plate (growth.GrowthPlate() object)
    '''

    # merge data-sets for easier analysis and perform basic summaries and manipulations
    plate = GrowthPlate(data=data, key=mapping)

    time_units = {unit: getTimeUnits(unit) for unit in ('input', 'output')}
    plate.convertTimeUnits(**time_units)

    plate.computeBasicSummary()
    plate.computeFoldChange(subtract_baseline=subtract_baseline)

    # blanks are handled first, then controls; whether the wells are dropped
    # afterwards is read from the configuration
    control_passes = (
        (subtract_blanks, 'drop_blank_wells', True),
        (subtract_control, 'drop_control_wells', False),
    )
    for requested, drop_setting, is_blank in control_passes:
        plate.subtractControl(to_do=requested,
                              drop=getValue(drop_setting),
                              blank=is_blank)

    # replace non-positive values, necessary prior to log-transformation
    plate.raiseData()

    # natural-log transform
    plate.logData(to_do=log_transform)

    # subtract first T0 (or rather divide by first T0)
    plate.subtractBaseline(subtract_baseline, poly=False)

    plate.dropFlaggedWells(to_do=drop_flagged_wells)

    return plate
Example #6
0
def setAxesLabels(ax, subtract_control, plot_params, logged=True, fontsize=20):
    '''
    Given an axis and analysis parameters, determine appropriate labels
        for axes and adjust them accordingly.

    The y-label starts from the configured 'hypo_plot_y_label' value, is prefixed
        with 'ln ' when logged is truthy, and is prefixed with 'Normalized ' when
        subtract_control is truthy. The x-label is 'Time (<output time units>)'.

    Note: this function also modifies global matplotlib rcParams (math-text and
        font settings), which affects figures drawn after the call.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        subtract_control (boolean)
        plot_params (dictionary): may contain 'fontsize'
        logged (boolean): whether the plotted values are log-transformed
        fontsize (float): label font size used only if plot_params has no 'fontsize' entry

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''
    import matplotlib as mpl
    mpl.rcParams["mathtext.default"] = 'regular'
    mpl.rcParams["font.family"] = 'sans-serif'
    mpl.rcParams["font.sans-serif"] = 'Arial'

    if logged:
        ylabel = 'ln {}'.format(getValue('hypo_plot_y_label'))
    else:
        ylabel = getValue('hypo_plot_y_label')

    if subtract_control:
        ylabel = 'Normalized {}'.format(ylabel)

    # plot_params keeps precedence (unchanged behavior for existing callers);
    # the previously ignored `fontsize` argument now serves as the fallback.
    label_size = plot_params.get('fontsize', fontsize)

    ax.set_xlabel('Time ({})'.format(getTimeUnits('output')),
                  fontsize=label_size)
    ax.set_ylabel(ylabel, fontsize=label_size)

    return ax
Example #7
0
    def LagTime(self):
        '''
        Computes the lag time with both a classical and a probabilistic definition.
            The former defines the lag time as the intersection with the axis parallel to time
            of the tangent to the latent function at the time of maximum growth rate.
            This tangent has slope m equivalent to the maximum of the derivative of the latent.
            The latter defines lag time as the first time point at which the probability
            that the growth rate (i.e. derivative of latent) is at or below zero is no
            longer greater than the configured 'confidence_adapt_time' value.

        Reads self.x, self.y0, self.y1, self.cov1 and self.t_gr; takes no arguments.

        Actions:
            sets self.lagC (classical) and self.lagP (probabilistic); either is
            numpy.inf when it cannot be determined (zero slope at maximal growth,
            or the probability never drops to the threshold).
        '''

        x = self.x
        y0 = self.y0
        y1 = self.y1
        cov1 = self.cov1

        # CLASSICAL MODE

        t_gr = self.t_gr  # time at maximal growth rate

        # np.where returns a 1-D array of matching indices; take the first match
        # explicitly because int() on a 1-D array is deprecated (NumPy >= 1.25)
        x_gr = int(np.where(x[:, 0] == t_gr)[0][0])  # index at maximal growth rate

        m1 = y1[x_gr]  # slope at maximal growth rate
        m0 = y0[x_gr]  # log OD at maximal growth rate

        if m1 == 0: lagC = np.inf  # no growth, then infinite lag
        else: lagC = (t_gr - (m0 / m1))[0]

        # PROBABILISTIC MODE

        confidence = getValue('confidence_adapt_time')

        # P(growth rate <= 0) at each time point, using the marginal mean and
        # variance of the derivative of the latent function
        prob = norm.cdf(0, y1[:, 0], np.sqrt(np.diag(cov1)))

        # first time point where that probability is not above the threshold
        below = np.flatnonzero(~(prob > confidence))

        if below.size == 0: lagP = np.inf
        else: lagP = float(x[below[0]][0])

        self.lagC = lagC
        self.lagP = lagP
Example #8
0
def minimizeDiauxieReport(df):
    '''
    Reduces a pandas.DataFrame so that, among the diauxie parameter columns, only
        those requested in the config.py file under the 'report_parameters'
        variable are kept. Columns that are not diauxie parameters are untouched.

    Args:
        df (pandas.DataFrame)

    Returns:
        (pandas.DataFrame): copy of df without the unrequested diauxie columns
    '''

    # diauxie parameter names requested by the user
    requested = set(initDiauxieList(getValue('report_parameters')))

    # diauxie parameter names that actually occur in the dataframe
    present = set(initDiauxieList()).intersection(set(df.keys()))

    unwanted = present - requested

    return df.drop(unwanted, axis=1)
Example #9
0
    def sample(self):
        '''
        Draw n samples from the posterior distribution of the latent function and
            of its derivative, estimate growth parameters for each draw, then
            summarize every parameter by its mean and standard deviation.

        The number of draws is read from the 'n_posterior_samples' config value.

        Actions:
            sets self.posterior, a dictionary keyed by 'mean(<param>)' and 'std(<param>)'

        Returns:
            self
        '''

        n_draws = getValue('n_posterior_samples')

        draws0 = np.random.multivariate_normal(self.y0.ravel(), self.cov0,
                                               n_draws)
        draws1 = np.random.multivariate_normal(self.y1.ravel(), self.cov1,
                                               n_draws)

        # one GrowthCurve per posterior draw; draws are reshaped to column vectors
        per_draw = [
            GrowthCurve(x=self.x,
                        y=self.y,
                        y0=latent[:, np.newaxis],
                        y1=derivative[:, np.newaxis],
                        cov0=self.cov0,
                        cov1=self.cov1).params
            for latent, derivative in zip(draws0, draws1)
        ]

        table = pd.DataFrame(per_draw)

        averages = table.mean().rename(lambda name: 'mean({})'.format(name))
        spreads = table.std().rename(lambda name: 'std({})'.format(name))

        self.posterior = pd.concat([averages, spreads]).to_dict()

        return self
Example #10
0
def describeVariance(df, time='X0', od='Y'):
    '''
    Estimate the variance of the measurements at each time point (ignoring NaNs),
        smooth it over time with a Gaussian filter, and attach it to the data
        as an 'error' column.

    df columns ['X0','X1',...,'Y']
    values of Xs except for X0 should be non-unique

    Args:
        df (pandas.DataFrame): long-format data
        time (str): name of the time column
        od (str): name of the measurement column; if df has no such column, the
            legacy column name 'OD' is used instead

    Returns:
        df (pandas.DataFrame): input rows sorted by time with an additional 'error' column
    '''

    window = getValue('variance_smoothing_window')
    nX = len(df[time].drop_duplicates())  # number of unique time points

    # a window below 1 is interpreted as a fraction of the number of time points
    if window < 1: window = int(np.ceil(nX * window))

    # honor the `od` argument (it used to be ignored in favor of a hard-coded
    # 'OD'); keep 'OD' as fallback so existing callers that relied on the
    # hard-coded name keep working
    od_col = od if od in df.columns else 'OD'

    df = df.sort_values(time)
    df.reset_index(drop=True, inplace=True)

    # variance across all rows sharing a time point
    error = df[[time, od_col]]
    error = error.groupby([time]).apply(lambda grp: np.nanvar(grp[od_col]))
    error = pd.DataFrame(error, columns=['error'])
    error = error.reset_index()
    error = error.drop_duplicates().set_index(time).sort_index()

    # smooth the variance along the (sorted) time axis
    error.loc[:, 'error'] = filters.gaussian_filter1d(error['error'].values,
                                                      window)

    # broadcast the per-time-point variance back onto every row
    df = pd.merge(df, error, on=time, how='outer').sort_values([time])

    return df
Example #11
0
    def plot(self, ax_arg=None):
        '''
        Plot the curve on two stacked panels: the top panel shows self.y (black)
            overlaid with self.y0 (red), and the bottom panel shows self.y1.

        Args:
            ax_arg (sequence of two matplotlib axes or None): axes to draw on;
                if None, a new 2x1 figure with a shared x-axis is created.

        Returns:
            (fig, ax) if the figure was created here, otherwise ax only.
        '''

        # compare against None explicitly: truth-testing an array of axes raises
        new_figure = ax_arg is None

        if new_figure:
            fig, ax = plt.subplots(2, 1, figsize=[6, 8], sharex=True)
        else:
            ax = ax_arg

        t = self.x.ravel()
        y = self.y.ravel()
        y0 = self.y0.ravel()  # was `.rave()`, which raised AttributeError
        y1 = self.y1.ravel()

        xmin = 0
        xmax = int(np.ceil(t[-1]))  # round last time point up to an integer

        ax[0].plot(t, y, lw=5, color=(0, 0, 0, 0.65))
        ax[0].plot(t, y0, lw=5, color=(1, 0, 0, 0.65))
        ax[1].plot(t, y1, lw=5, color=(0, 0, 0, 0.65))

        # enlarge tick labels on both panels
        for panel in (ax[0], ax[1]):
            for label in panel.get_xticklabels() + panel.get_yticklabels():
                label.set(fontsize=20)

        ylabel = getValue('hypo_plot_y_label')
        ax[1].set_xlabel('Time', fontsize=20)
        ax[0].set_ylabel(ylabel, fontsize=20)
        ax[1].set_ylabel('d/dt {}'.format(ylabel), fontsize=20)

        ax[0].set_xlim([xmin, xmax])

        if new_figure:
            return fig, ax
        return ax
Example #12
0
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.
        While runGrowthFitting() analyzes data one plate at a time, runCombinedGrowthFitting()
        can pool experimental replicates across different plates. The downside is that data
        summary must be merged and no 96-well plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and value are user/default choices; this
            function reads 'fout', 'pb', 'slf', 'fn', 'nthin' and 'sgd' directly and
            forwards the whole dictionary to normalizePooledParameters().
        verbose (boolean)

    Returns:
        None

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory (if args['sgd']).
        exits via sys.exit() if any key listed in args['pb'] is missing from the mapping.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(
            missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True,
                           poly=getValue('PolyFit'),
                           groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()

    # one row per unique combination of the pooling variables
    plate_cond = plate_key.loc[:, combine_keys +
                               ['Group', 'Control']].drop_duplicates(
                                   combine_keys).reset_index(drop=True)

    smartPrint(
        'AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
        verbose)

    data_ls, diauxie_dict = [], {}

    # user choices
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict(
        )  # e.g. {'Substate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(
            plate_key,
            cond_dict).index.values  # list of index values for N samples
        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1

        cond_data = cond_data.melt(id_vars='Time',
                                   var_name='Sample_ID',
                                   value_name='OD')
        cond_data = cond_data.drop(
            ['Sample_ID'], axis=1)  # T*R x 2 (where R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data,
                         ARD=True,
                         heteroscedastic=fix_noise,
                         nthin=nthin)

        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior: params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise: sigma_noise = np.ravel(gm.error_new) + gm.noise
            else: sigma_noise = np.ravel([gm.noise] * time.shape[0])

            mu_var = pd.DataFrame(
                [mu0, var0, mu1, var1, sigma_noise],
                index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T

            # repeat the condition's meta-data on every time point
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior: gp_params = params_sample.join(params_latent['diauxie'])
    else: gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    # split results into a growth-parameter table and a diauxie table
    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary',
                                 '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie',
                                 '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename,
                                     'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
Example #13
0
def runGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.

    If args['pool'] is set, the work is delegated to runCombinedGrowthFitting().
        Otherwise each plate (unique Plate_ID) is fit separately and its results are
        written either directly to the output folders or, if args['merges'] is set,
        to a temporary directory before being merged by mergeSummaryData().

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and value are user/default choices
        verbose (boolean)

    Returns:
        None

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
        temporarily sets the process umask to 0o77; it is restored, and the
            temporary directory removed, even if fitting raises.
    '''
    import shutil  # local import: only needed for temporary-directory cleanup

    if args['pool']:
        runCombinedGrowthFitting(data,
                                 mapping,
                                 directory,
                                 args,
                                 verbose=verbose)
        return None

    # only store data if user requested its writing or requested plotting
    store = bool(args['sgd'] or args['plot'] or args['pd'])

    # if user requested merging of summary/data, store each plate's data/summary in temp directory first
    tmpdir = tempfile.mkdtemp()
    saved_umask = os.umask(
        0o77)  ## files can only be read/written by creator for security

    try:
        print('Temporary directory is {}\n'.format(tmpdir))

        # pre-process data
        plate = prepDataForFitting(data, mapping, subtract_baseline=True)

        ls_temp_files = []
        ls_summ_files = []
        ls_diux_files = []

        # for each plate, get samples and save individual text file for plate-specific summaries
        for pid in plate.key.Plate_ID.unique():

            smartPrint('Fitting {}'.format(pid), verbose)

            # grab plate-specific summary
            sub_plate = plate.extractGrowthData(args_dict={'Plate_ID': pid})

            # the primary motivation of this function: run gp model
            sub_plate.model(nthin=args['nthin'], store=store, verbose=verbose)

            # normalize parameters, if requested
            sub_plate.key = normalizeParameters(args, sub_plate.key)

            # save plots, if requested by user
            savePlots(sub_plate, args, directory, pid)

            # define file paths where data will be written
            if args['merges']:
                temp_path = assembleFullName(tmpdir, '', pid, 'gp_data',
                                             '.txt')
                summ_path = assembleFullName(tmpdir, '', pid, 'summary',
                                             '.txt')
                diux_path = assembleFullName(tmpdir, '', pid, 'diauxie',
                                             '.txt')
            else:
                temp_path = assembleFullName(directory['derived'], '', pid,
                                             'gp_data', '.txt')
                summ_path = assembleFullName(directory['summary'], '', pid,
                                             'summary', '.txt')
                diux_path = assembleFullName(directory['summary'], '', pid,
                                             'diauxie', '.txt')

            # save data, if requested by user
            savePlateData(args['sgd'], sub_plate, temp_path, summ_path,
                          diux_path)

            # track all potentially created files
            ls_temp_files.append(temp_path)
            ls_summ_files.append(summ_path)
            ls_diux_files.append(diux_path)

        # if user did not pass file name for output, use time stamp, see selectFileName()
        filename = selectFileName(args['fout'])

        # if user requested merging, merge all files in temporary directory
        mergeSummaryData(args, directory, ls_temp_files, ls_summ_files,
                         ls_diux_files, filename)

    finally:
        # always restore the process-wide umask and remove the temporary
        # directory; rmtree (unlike os.rmdir) also succeeds when files remain
        os.umask(saved_umask)
        shutil.rmtree(tmpdir, ignore_errors=True)

    return None
Example #14
0
    def plot(self,
             save_path='',
             plot_fit=False,
             plot_derivative=False,
             plot_raw_with_fit=False):
        '''
        Creates a 8x12 grid plot (for 96-well plate) that shows the growth curves in each well.
            Plot aesthetics require several parameters that are saved in config.py and pulled using 
            functions in misc.py. Plot will be saved as a PDF to location passed via argument. Index
            column for object's key should be Well IDs but object's key should also have a Well column.

        The plotting mode is chosen in this order of precedence: plot_derivative,
            plot_fit, plot_raw_with_fit; if none is set, self.data is plotted as is.
            All modes except the default read from self.gp_data.

        Args:
            save_path (str): file path: if empty, plot will not be saved at all.
            plot_fit (boolean): plot 'OD_Growth_Data' overlaid with 'OD_Growth_Fit' from self.gp_data.
            plot_derivative (boolean): if True, plot only the derivative of GP fit instead. 
            plot_raw_with_fit (boolean): plot 'OD_Data' overlaid with 'OD_Fit' from self.gp_data.

        Returns:
            fig,axes: figure and axis handles, or None if the object is not a
                single multi-well plate (see isSingleMultiWellPlate()).

        Action:
            if user passes save_path argument, plot will be saved as PDF in desired location 
        '''

        sns.set_style('whitegrid')

        # presumably adds the 'Row' and 'Column' columns to self.key that are
        # used for sub-plot placement and dropped at the end — TODO confirm
        self.addLocation()

        time = self.time

        # keep only the meta-data needed for plotting; reindex() tolerates
        # missing columns, which are then removed by dropna() below
        cols = [
            'Sample_ID', 'Plate_ID', 'Well', 'Row', 'Column', 'Fold_Change',
            'OD_Max', 'OD_Baseline'
        ]
        key = self.key.reindex(
            cols,
            axis='columns',
        )
        key = key.dropna(axis=1, how='all')
        if 'Sample_ID' in key.columns:
            key = key.drop_duplicates().set_index('Sample_ID')

        # make sure plate is 96-well version, otherwise skip plotting
        # NOTE(review): this early return skips the drop of 'Row'/'Column' at the
        # end of the method — confirm whether self.key should be cleaned up here
        if not self.isSingleMultiWellPlate():
            msg = 'WARNING: GrowthPlate() object for {} is not a 96-well plate. '.format(
                self.key.Plate_ID.iloc[0])
            msg += 'AMiGA can not plot it.\n'
            print(msg)
            return None

        # select the data to plot: base_y is always drawn, overlay_y (only
        # defined in the two fit modes) is drawn on top as a dashed line
        if plot_derivative:
            base_y = self.gp_data.pivot(columns='Sample_ID',
                                        index='Time',
                                        values='GP_Derivative')
        elif plot_fit:
            base_y = self.gp_data.pivot(columns='Sample_ID',
                                        index='Time',
                                        values='OD_Growth_Data')
            overlay_y = self.gp_data.pivot(columns='Sample_ID',
                                           index='Time',
                                           values='OD_Growth_Fit')
        elif plot_raw_with_fit:
            base_y = self.gp_data.pivot(columns='Sample_ID',
                                        index='Time',
                                        values='OD_Data')
            overlay_y = self.gp_data.pivot(columns='Sample_ID',
                                           index='Time',
                                           values='OD_Fit')
        else:
            base_y = self.data  #gp_data.pivot(columns='Sample_ID',index='Time',values='OD_Data')

        fig, axes = plt.subplots(8, 12, figsize=[12, 8])

        # define window axis limits (shared by all sub-plots, rounded outward)
        ymax = np.ceil(base_y.max(1).max())
        ymin = np.floor(base_y.min(1).min())

        if plot_fit: ymin = 0

        xmin = 0
        xmax = time.values[-1]
        xmax_up = int(np.ceil(xmax))  # round up to nearest integer

        for well in base_y.columns:

            # select proper sub-plot (Row/Column are 1-based, axes are 0-based)
            r, c = key.loc[well, ['Row', 'Column']] - 1
            ax = axes[r, c]

            # get colors based on fold-change, else fall back to configured defaults
            if 'Fold_Change' in key.keys():
                color_l, color_f = getPlotColors(key.loc[well, 'Fold_Change'])
            else:
                color_l = getValue('fcn_line_color')
                color_f = getValue('fcn_face_color')

            # set window axis limits
            ax.set_xlim([xmin, xmax])
            ax.set_ylim([ymin, ymax])

            # define x-data and y-data points
            x = np.ravel(time.values)
            y = base_y.loc[:, well].values

            # plot line and fill_between, if plotting OD estimate
            ax.plot(x, y, color=color_l, lw=1.5, zorder=10)
            if not plot_derivative:
                ax.fill_between(x=x,
                                y1=[ax.get_ylim()[0]] * len(y),
                                y2=y,
                                color=color_f,
                                zorder=7)

            # add fit lines, if desired
            if plot_fit or plot_raw_with_fit:
                y_fit = overlay_y.loc[:, well].values
                ax.plot(x,
                        y_fit,
                        color='yellow',
                        alpha=0.65,
                        ls='--',
                        lw=1.5,
                        zorder=10)

            # show tick labels for bottom left subplot only, so by default no labels
            if plot_derivative:
                plt.setp(ax, yticks=[ymin, 0, ymax], yticklabels=[]
                         )  # zero derivative indicates no instantaneous growth
            else:
                plt.setp(ax, yticks=[ymin, ymax], yticklabels=[])
            plt.setp(ax, xticks=[xmin, xmax], xticklabels=[])

            # add well identifier on top left of each sub-plot
            well_color = getTextColors('Well_ID')
            ax.text(0.,
                    1.,
                    key.loc[well, 'Well'],
                    color=well_color,
                    ha='left',
                    va='top',
                    transform=ax.transAxes)

            # add Max OD value on top right of each sub-plot; the baseline is
            # removed from the displayed value when self.mods.floored is set
            if self.mods.floored:
                od_max = key.loc[well, 'OD_Max'] - key.loc[well, 'OD_Baseline']
            else:
                od_max = key.loc[well, 'OD_Max']
            ax.text(1.,
                    1.,
                    "{0:.2f}".format(od_max),
                    color=getTextColors('OD_Max'),
                    ha='right',
                    va='top',
                    transform=ax.transAxes)

        # show tick labels for bottom left sub-plot only
        plt.setp(axes[7, 0], xticks=[0, xmax], xticklabels=[0, xmax_up])
        plt.setp(axes[7, 0], yticks=[ymin, ymax], yticklabels=[ymin, ymax])

        # add x- and y-labels and title
        ylabel_base = getValue('grid_plot_y_label')
        ylabel_mod = ['ln ' if self.mods.logged else ''][0]

        if plot_derivative: ylabel_text = 'd[ln{}]/dt'.format(ylabel_base)
        else: ylabel_text = ylabel_mod + ylabel_base

        # add labels and title
        fig.text(0.512,
                 0.07,
                 'Time ({})'.format(getTimeUnits('output')),
                 fontsize=15,
                 ha='center',
                 va='bottom')
        fig.text(0.100,
                 0.50,
                 ylabel_text,
                 fontsize=15,
                 ha='right',
                 va='center',
                 rotation='vertical')
        # the title uses the Plate_ID of the last well visited by the loop above
        fig.suptitle(x=0.512,
                     y=0.93,
                     t=key.loc[well, 'Plate_ID'],
                     fontsize=15,
                     ha='center',
                     va='center')

        # if no file path passed, do not save
        if save_path != '': plt.savefig(save_path, bbox_inches='tight')

        # remove the location columns from the object's key again
        self.key.drop(['Row', 'Column'], axis=1, inplace=True)

        # closes the current pyplot figure; the handles are still returned
        plt.close()

        return fig, axes
Example #15
0
def detectDiauxie(x, y0, y1, y2, cov0, cov1, thresh, varb='K'):
    '''
    Decompose a growth curve into individual growth phases separated by OD inflection.

    Args:
        x (numpy.ndarray): time; if it has more than one dimension, only the first column is used
        y0 (numpy.ndarray): mean of latent function
        y1 (numpy.ndarray): mean of first derivative of latent function (i.e. growth rate)
        y2 (numpy.ndarray): mean of second derivative of latent function (i.e. acceleration)
        cov0 (numpy.ndarray): covariance of latent function (not used by this function)
        cov1 (numpy.ndarray): covariance of first derivative of latent function (not used
            by this function)
        thresh (float): a growth phase whose `varb` is smaller than `thresh` times the
            largest `varb` among all phases gets merged with an adjacent phase
        varb (str): use either 'K' or 'r' to threshold/call secondary growth curves

    Note:
        y1 and y2 are indexed as y[i][0], i.e. they are expected to be column
        vectors of shape (n, 1).

    Returns:
        ret (pandas.DataFrame): dataframe summarizes each growth phase with following:
            t_left: time at left bound
            t_right: time at right bound
            K: total growth
            r: maximum growth rate
            r_left: growth rate at left bound
            r_right: growth rate at right bound
    '''

    # the other variable is only used to break ties when ranking phases
    if varb == 'K':
        second_varb = 'r'
    else:
        second_varb = 'K'

    if x.ndim > 1:
        x = x[:, 0].ravel()  # assumes time is first dimension

    # indices for inflections (sign changes of the second derivative)
    ips = list(np.where(np.diff(np.sign(y2.ravel())))[0])

    # no inflection, or too little growth: report the whole curve as one phase
    if len(ips) == 0 or np.max(y0) < getValue('diauxie_k_min'):
        cols = ['t_left', 't_right', 'K', 'r', 'r_left', 'r_right']
        ret = pd.DataFrame(
            [x[0], x[-1],
             np.max(y0 - y0[0]),  # total change in OD, same definition as below
             np.max(y1), y1[0][0], y1[-1][0]],
            index=cols)
        return ret.T

    # types of inflections: sign of acceleration right after the inflection
    #   (or the flipped sign right before it, when at the end of the curve)
    its = [
        np.sign(y2[ii + 1])[0] if ii <
        (len(y2) - 2) else -1 * np.sign(y2[ii - 1])[0] for ii in ips
    ]

    # pad edge cases
    ips, its = pad(ips, its, edge=1, length=len(y2))
    ips, its = pad(ips, its, edge=-1, length=len(y2))

    # convert data types
    ips = np.array([int(ii) for ii in ips])
    its = np.array(its)

    # define bounds of each growth stage: from one positive inflection to
    #   the entry two positions later in the list of inflections
    starts = np.where(its == 1)[0][:-1]
    stops = starts + 2

    # initialize a summary dataframe and populate with bounds
    ret = np.zeros((len(starts), 7))
    ret[:, 0] = ips[starts]
    ret[:, 1] = ips[stops]

    # compute several metrics for growth stage (should I use absolute?)
    bounds = [(int(ii[0] + 1), int(ii[1] + 1)) for ii in ret]
    ret[:, 2] = [np.max((y0[l:r] - y0[l]))
                 for l, r in bounds]  # Total change in OD
    ret[:, 3] = [np.max(y1[l:r]) for l, r in bounds]  # max growth rate,
    ret[:, 4:6] = [[y1[l - 1], y1[r - 1]]
                   for l, r in bounds]  # growth rate at both bounds

    # define attraction of each growth stage: a growth stage is attracted
    #   to the adjacent growth stage with the least difference in terms of
    #   growth rate at the shared bounds (relative to max growth rate
    #   within the bounds); -1 points backward in time, +1 forward
    ret[:, 6] = [
        -1 if np.abs(row[5] - row[3]) > np.abs(row[4] - row[3]) else 1
        for row in ret
    ]

    # annotate dataframe
    cols = ['t_left', 't_right', 'K', 'r', 'r_left', 'r_right', 'attraction']
    ret = pd.DataFrame(ret, columns=cols)

    # how to deal with negative r or K
    #   if at least one value is nonzero positive
    if any(ii > 0 for ii in ret[varb].values):
        # starting with the smallest growth stage (smallest total change in OD):
        #    if its K is smaller than a certain proportion of the max K
        #    merge with attractor, continue until all growth phases meet criteria
        while ret[varb].min() < thresh * ret[varb].max():

            ret = ret.sort_values(['t_left'])
            ret.iloc[0,
                     -1] = 1  # first phase is always attracted forward in time
            ret.iloc[
                -1, -1] = -1  # last phase is always attracted backward in time

            # smallest phase first; its index label plus its attraction
            #   gives the index label of the phase to merge with
            ret = ret.sort_values([varb, second_varb])
            idx = ret.index.values[0]
            att = ret.loc[idx, 'attraction']
            att = idx + att
            ret = mergePhases(ret, idx, att, varb=varb)

            # should you re-compute attraction?
    else:
        while ret.shape[0] > 1:  # coalesce all into a single curve
            ret = mergePhases(ret, 0, 1)

    # re-sort by time and convert array indices to time values
    ret = ret.sort_values(['t_left'])
    ret.iloc[:, 0] = ret.iloc[:, 0].apply(lambda i: x[int(i)])
    ret.iloc[:, 1] = ret.iloc[:, 1].apply(lambda i: x[int(i)])
    ret.drop('attraction', axis=1, inplace=True)

    return ret
Example #16
0
    def describe(self):
        '''
        Summarize the curve with a set of growth parameters and, when a second
            derivative is available, with a description of each growth phase.

        Action:
            stores a dictionary of growth parameters in self.params. If self.y2
            is not None, the dictionary also holds 'diauxie' (1 if more than one
            growth phase was detected, otherwise 0) and 'df_dx' (pandas.DataFrame
            with one row per growth phase and columns prefixed with 'dx_').
        '''

        # user-defined criteria for calling diauxie
        ratio_min = getValue('diauxie_ratio_min')
        ratio_varb = getValue('diauxie_ratio_varb')

        # run all estimators before reading their results below
        self.AreaUnderCurve()
        self.CarryingCapacity()
        self.MaxGrowthRate()
        self.MinGrowthRate()
        self.LagTime()
        self.StationaryDelta()

        # (key in report, name of attribute holding the value)
        fields = (
            ('auc_lin', 'auc_lin'),
            ('auc_log', 'auc_log'),
            ('k_lin', 'K_lin'),
            ('k_log', 'K_log'),
            ('t_k', 't_K'),
            ('gr', 'gr'),
            ('dr', 'dr'),
            ('td', 'td'),
            ('t_gr', 't_gr'),
            ('t_dr', 't_dr'),
            ('death_lin', 'death_lin'),
            ('death_log', 'death_log'),
            ('lagC', 'lagC'),
            ('lagP', 'lagP'),
        )
        params = {key: getattr(self, attr) for key, attr in fields}

        # without a second derivative, growth phases can not be detected
        if self.y2 is None:
            self.params = params
            return

        phases = detectDiauxie(self.x,
                               self.y0,
                               self.y1,
                               self.y2,
                               self.cov0,
                               self.cov1,
                               thresh=ratio_min,
                               varb=ratio_varb)

        # describe all phases
        last = len(self.x) - 1
        frames = []
        for label, phase in phases.iterrows():

            # bounds of the phase as time values
            t_left, t_right = phase['t_left'], phase['t_right']

            # bounds of the phase as positions in self.x
            i_left = np.where(self.x == t_left)[0][0]
            i_right = np.where(self.x == t_right)[0][0]

            if i_left == 0 and i_right == last:
                # phase spans the whole curve, re-use its parameters
                # NOTE: this is the same dictionary as params (not a copy), so
                #   't0' and 'tf' below are added to params as well
                phase_params = params
            else:
                # otherwise, describe the phase as its own curve that starts at zero
                segment = GrowthCurve(
                    x=self.x[i_left:i_right],
                    y0=self.y0[i_left:i_right] - self.y0[i_left],
                    y1=self.y1[i_left:i_right],
                    cov0=self.cov0[i_left:i_right, i_left:i_right],
                    cov1=self.cov1[i_left:i_right, i_left:i_right])
                phase_params = segment.params

            phase_params['t0'] = t_left
            phase_params['tf'] = t_right
            frames.append(pd.DataFrame(phase_params, index=[label]))

        df_dx = pd.concat(frames, axis=0)
        df_dx.columns = ['dx_{}'.format(name) for name in df_dx.columns]

        params['diauxie'] = 1 if phases.shape[0] > 1 else 0
        params['df_dx'] = df_dx

        self.params = params
Example #17
0
    def plotPredictions(self):
        '''
        Visualizes the model tested by a specific hypothesis given the data.

        Takes no arguments; reads the following attributes instead:
            x_full (pandas.DataFrame): passed to addMVNPlotLine for model predictions
            x_min (pandas.DataFrame): its number of rows caps the number of colors used
            factor_dict (dictionary): mapping of unique values of variables to numerical integers
            target (list): first item is the variable of interest
            plate (growth.GrowthPlate obj): passed to addRealPlotLine
            subtract_control (boolean): passed to setAxesLabels
            functional_diff: passed to plotDeltaOD, only if delta OD is plotted
            paths_dict (dictionary): must include 'dir' and 'filename' as keys
            args (dictionary): must include 'dp', 'noise', and 'pdo' as keys

        Action:
            assembles a figure and passes it, along with a path that has a
            '.pdf' extension, to savePlotWithLegends. Does nothing and returns
            None if args['dp'] is set.
        '''

        # plotting was disabled, skip all work
        if self.args['dp']: return None

        # get necessary attributes
        x_full = self.x_full
        x_min = self.x_min
        variable = self.target[0]
        plate = self.plate
        subtract_control = self.subtract_control
        noise = self.args['noise']

        # get user-accessible plotting parameters
        plot_params = getHypoPlotParams()  # dict
        tick_spacing = plot_params['tick_spacing']
        legend_loc = plot_params['legend']
        fontsize = plot_params['fontsize']

        colors = getValue('hypo_colors')  # list of colors
        confidence = getValue('confidence')  # confidence interval, e.g. 0.95
        confidence = 1 - (1 - confidence) / 2  # e.g. 0.95 becomes 0.975

        # grab mapping of integer codes in design matrix to actual variable labels
        varb_codes_map = reverseDict(self.factor_dict[variable])  # {codes:values}

        # set figure aesthetics
        sns.set_style('whitegrid')
        rcParams['font.family'] = 'sans-serif'
        rcParams['font.sans-serif'] = 'Arial'

        # initialize grid: top panel for curves, bottom panel for delta OD
        fig, ax = plt.subplots(2, 1, figsize=[5, 10.5], sharex=False)

        # one color for each unique value of variable of interest
        list_values = varb_codes_map.items()
        list_colors = colors[0:x_min.shape[0]]

        # for each unique value of variable of interest, plot data and MVN prediction
        for (code, label), color in zip(list_values, list_colors):
            criteria_real = {variable: [label]}
            criteria_mvn = {variable: code}

            ax[0] = addRealPlotLine(ax[0], plate, criteria_real, color,
                                    plot_params)
            ax[0] = addMVNPlotLine(ax[0], x_full, criteria_mvn, label,
                                   confidence, color, plot_params, noise)
            ax[0].xaxis.set_major_locator(MultipleLocator(tick_spacing))

        # adjust labels and window limits
        ax[0] = setAxesLabels(ax[0], subtract_control, plot_params)

        # if variable has only 2 values and if requested, plot delta OD
        if (len(list_values) != 2) or (not self.args['pdo']):
            fig.delaxes(ax[1])
        else:
            ax[1] = plotDeltaOD(ax[1],
                                self.functional_diff,
                                ylabel=True,
                                xlabel=True,
                                fontsize=fontsize)
            ax[1].xaxis.set_major_locator(MultipleLocator(tick_spacing))
            ax[0].set_xlabel('')  # bottom panel already shows the x-label

        ax = dynamicWindowAdjustment(ax)

        # save figure
        fig_path = assemblePath(self.paths_dict['dir'],
                                self.paths_dict['filename'], '.pdf')
        plt.subplots_adjust(wspace=0.15, hspace=0.15)
        savePlotWithLegends(ax[0], fig_path, legend_loc, fontsize=fontsize)
Example #18
0
    def computeFullDifference(self):
        '''
        Computes the full difference between two latent functions (modelling growth curves).

        Takes no arguments; reads the following attributes instead:
            x_full (pandas.DataFrame): must include columns of Time, mu (mean of latent
                function), Sigma (diagonal covariance of latent function), the variable
                of interest, and Noise if args['noise'] is set
            target (list): first item is the variable of interest
            args (dictionary): must include 'noise' and 'sgd' as keys
            paths_dict (dictionary): must include 'dir' and 'filename' as keys

        Action:
            sets the following attributes
                functional_diff (pandas.DataFrame): columns are Time, Avg, Low, and Upp, i.e.
                    mean of the difference and bounds of its interval at each time point
                    except for the first
                delta_od_sum_mean (float): mean of the Euclidean norm of the difference,
                    across curves sampled from an MVN distribution
                delta_od_sum_ci (tuple): lower and upper bounds for that norm
            and, if args['sgd'] is set, saves functional_diff as a tab-separated file.

        Note:
            the test matrix assumes that x_full holds exactly two curves which
            share the same time points.
        '''

        variable = self.target[0]
        noise = self.args['noise']
        save_latent = self.args['sgd']

        confidence = getValue('confidence')  # confidence interval, e.g. 0.95
        confidence = 1 - (1 - confidence) / 2  # e.g. 0.95 becomes 0.975
        n_samples = getValue('n_posterior_samples')

        def buildTestMatrix(x_time):
            '''
            Build a test matrix to simplify OD full difference computation.
                See https://github.com/ptonner/gp_growth_phenotype/testStatistic.py 
                This is used to compare two growth latent functions. The difference between
                first time points (measurements) is adjusted to zero. 
            Args:
                x_time (pandas.DataFrame or pandas.Series or numpy.ndarray): only its
                    first dimension (number of time points) is used
            Returns:
                A (numpy.ndarray): N-1 x 2*N where N is length of time.
            '''

            n = x_time.shape[0]
            rows = range(n - 1)

            A = np.zeros((n - 1, 2 * n))
            A[:, 0] = 1  # first time point of first curve
            A[rows, range(1, n)] = -1  # later time points of first curve
            A[:, n] = -1  # first time point of second curve
            A[rows, n + np.arange(1, n)] = 1  # later time points of second curve

            return A

        # sorting stacks the curve for each value of variable, one after another
        x_diff = self.x_full.sort_values([variable, 'Time'])
        x_time = x_diff.Time.drop_duplicates()

        # define mean and covariance of data
        mu = x_diff['mu'].values
        if noise: Sigma = np.diag(x_diff['Sigma'] + x_diff['Noise'])
        else: Sigma = np.diag(x_diff['Sigma'])

        # define mean and covariance of functional difference
        A = buildTestMatrix(x_time)
        m = np.dot(A, mu)
        c = np.dot(A, np.dot(Sigma, A.T))
        std = np.sqrt(np.diag(c))

        # sample the curve for the difference between functions, from an MVN distribution
        samples = np.random.multivariate_normal(m, c, n_samples)

        # compute the norm of functional differences for each sampled curve
        dos = np.sqrt(np.sum(samples**2, axis=1))
        dos_mu, dos_std = np.mean(dos), np.std(dos)

        # compute the confidence interval for the norm of functional differences
        scaler = norm.ppf(confidence)  # interval half-width in standard deviations
        ci = (dos_mu - scaler * dos_std, dos_mu + scaler * dos_std)

        # compute credible intervals for the curve of the difference
        y_low = m - scaler * std
        y_upp = m + scaler * std

        # package results, first time point has no difference by design
        t = x_time.values[1:]
        df = pd.DataFrame([t, m, y_low, y_upp],
                          index=['Time', 'Avg', 'Low', 'Upp']).T

        self.functional_diff = df
        self.delta_od_sum_mean = dos_mu
        self.delta_od_sum_ci = ci

        # save functional difference
        if save_latent:
            file_path = assembleFullName(self.paths_dict['dir'], '',
                                         self.paths_dict['filename'],
                                         'func_diff', '.txt')
            df.to_csv(file_path, sep='\t', header=True, index=True)
Example #19
0
    def savePredictions(self):
        '''
        Given model predictions of growth curves (for each unique set of conditions tested),
            describe the latent function and its derivative in terms of growth parameters,
            then save summaries as tab-separated files.

        Takes no arguments; reads the following attributes instead:
            model: its model attribute is passed to GrowthModel
            x_full (pandas.DataFrame): must include columns of mu, Sigma, and Noise
            x_min (pandas.DataFrame): one row per unique set of conditions tested
            factor_dict (dictionary): mapping of unique values of variables to numerical integers
            target (list): first item is the variable of interest
            args (dictionary): must include 'slf' (whether to summarize samples from the
                posterior) and 'sgd' (whether to save x_full) as keys
            paths_dict (dictionary): must include 'dir' and 'filename' as keys

        Action:
            saves growth parameters, with file name assembled using 'params'
            saves diauxie parameters, if any, with file name assembled using 'diauxie'
            saves x_full with decoded variables, if requested, with file name
                assembled using 'output'
        '''

        model = self.model
        factor_dict = self.factor_dict
        variable = self.target[0]
        confidence = getValue('confidence')  # confidence interval, e.g. 0.95

        posterior = self.args['slf']
        save_latent = self.args['sgd']

        dir_path = self.paths_dict['dir']
        file_name = self.paths_dict['filename']

        x_full = self.x_full
        x_min = self.x_min

        diauxie_dict = {}
        params_latent = initParamDf(x_min.index, complexity=0)
        params_sample = initParamDf(x_min.index, complexity=1)

        for idx, row in x_min.iterrows():

            # get model input for this set of conditions; axis must be passed
            #   by keyword because pandas>=2.0 rejects it as positional argument
            df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1),
                          row.to_dict())

            # get curve based on model predictions
            gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
            curve = gm.run()

            # get parameter estimates using predicted curve; description of
            #   growth phases is a dataframe, so it is stored separately
            diauxie_dict[idx] = curve.params.pop('df_dx')
            params_latent.loc[idx, :] = curve.params

            if posterior: params_sample.loc[idx, :] = curve.sample().posterior

        # summarize diauxie results
        diauxie_df = mergeDiauxieDfs(diauxie_dict)

        if posterior: gp_params = params_sample.join(params_latent['diauxie'])
        else: gp_params = params_latent

        gp_params = x_min.join(gp_params)
        gp_params.index.name = 'Sample_ID'
        gp_params = gp_params.reset_index(drop=False)
        gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

        # replace integer codes of variables with their actual values
        x_out = x_full.copy()
        for key, mapping in factor_dict.items():
            if key in x_out.keys():
                x_out.loc[:,
                          key] = x_out.loc[:,
                                           key].replace(reverseDict(mapping))
            if key in gp_params.keys():
                gp_params.loc[:, key] = gp_params.loc[:, key].replace(
                    reverseDict(mapping))

        # split results into a report for growth parameters and another for diauxie
        diauxie = initDiauxieList()
        params = initParamList(0) + initParamList(1)
        params = list(set(params).intersection(set(gp_params.keys())))

        df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
        df_params = minimizeParameterReport(df_params)
        df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
        df_diauxie = minimizeDiauxieReport(df_diauxie)

        if posterior:
            df_params = prettyifyParameterReport(df_params, variable,
                                                 confidence)
            df_params = articulateParameters(df_params, axis=0)

        summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
        diux_path = assembleFullName(dir_path, '', file_name, 'diauxie',
                                     '.txt')

        # index is only written for the posterior report
        df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
        if df_diauxie.shape[0] > 0:
            df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

        if save_latent:
            file_path = assembleFullName(dir_path, '', file_name, 'output',
                                         '.txt')
            x_out.to_csv(file_path, sep='\t', header=True, index=True)
Example #20
0
#   predict_y2
#   run

import warnings
import numpy as np
import pandas as pd

from GPy.models import GPRegression

from scipy.ndimage import filters

from libs.kernel import buildKernel, addFixedKernel
from libs.curve import GrowthCurve
from libs.utils import uniqueRandomString, subsetDf, getValue

# silence all RuntimeWarning messages for the whole process, if the
#   'Ignore_RuntimeWarning' setting returned by getValue is truthy
if getValue('Ignore_RuntimeWarning'):
    warnings.filterwarnings("ignore", category=RuntimeWarning)


def describeVariance(df, time='X0', od='Y'):
    '''
    df columns ['X0','X1',...,'Y']
    values of Xs except fo X0 should be non-unique
    '''

    window = getValue('variance_smoothing_window')

    df = df.sort_values('Time')
    df.reset_index(drop=True, inplace=True)

    nX = len(df[time].drop_duplicates())