def setAxesLabels(ax, subtract_control, plot_params, fontsize=20):
    '''
    Given an axis and analysis parameters, determine appropriate labels for axes and
        adjust them accordingly.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        subtract_control (boolean): if True, the y-label is prefixed with 'Normalized'.
        plot_params (dictionary): must include 'plot_linear_od'; its 'fontsize' entry,
            when present, sets the size of both axis labels.
        fontsize (float): fall-back label size, used only when plot_params has no
            'fontsize' entry.

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    linear_od = plot_params['plot_linear_od']

    # the y-label is prefixed with 'ln' unless linear OD was requested
    base = getValue('hypo_plot_y_label')
    if not linear_od:
        base = 'ln {}'.format(base)

    ylabel = 'Normalized {}'.format(base) if subtract_control else base

    # plot_params takes precedence; the fontsize argument used to be silently ignored
    label_size = plot_params.get('fontsize', fontsize)

    ax.set_xlabel('Time ({})'.format(getTimeUnits('output')), fontsize=label_size)
    ax.set_ylabel(ylabel, fontsize=label_size)

    return ax
def describeVariance(df, time='X0', od='Y'):
    '''
    Estimates the variance of the measurements at each time point across replicates,
        smooths it with a Gaussian filter, and attaches it to the data as column 'error'.

    Args:
        df (pandas.DataFrame): columns ['X0','X1',...,'Y']; values of Xs except for X0
            should be non-unique. Every time point is assumed to have the same number
            of rows (replicates) -- TODO confirm with callers.
        time (str): name of the time column.
        od (str): name of the measurement column.

    Returns:
        df (pandas.DataFrame): input rows sorted by ['SID', time] with two extra
            columns, 'SID' (replicate number within a time point) and 'error'.
    '''

    window = getValue('variance_smoothing_window')

    # rows must be grouped by time point for the replicate numbering below to be valid;
    # sort on the caller-supplied column (previously a hard-coded 'Time')
    df = df.sort_values(time)
    df.reset_index(drop=True, inplace=True)

    nX = len(df[time].drop_duplicates())  # number of unique time points
    nS = int(df.shape[0] / nX)  # number of rows per time point

    # number the rows within each time point 0..nS-1
    sid = pd.DataFrame(np.tile(np.arange(nS), nX), columns=['SID'])
    df = df.join(sid)

    # time points x replicates
    tmp = pd.pivot(df, index=time, columns='SID', values=od)

    # a window below 1 is read as a fraction of the number of time points
    if window < 1:
        window = int(np.ceil(nX * window))

    var = np.var(tmp.values, 1)
    var = filters.gaussian_filter1d(var, window)

    # after this sort each replicate is a contiguous run of nX time points,
    # so the smoothed variance can simply be repeated once per replicate
    df = df.sort_values(['SID', time])
    df.loc[:, 'error'] = np.tile(var, nS)

    return df
def prepRegressionPlate(self):
    '''
    Packages the object's master data and mapping into a growth.GrowthPlate() object
        and performs a select number of its class functions.

    Reads:
        self.master_data, self.master_mapping, self.non_time_varbs,
        self.subtract_control, self.paths_dict['key']

    Actions:
        calls convertTimeUnits(), logData(), subtractBaseline() (grouped by
            self.non_time_varbs), and subtractControl() on the plate, in that order.
        writes the plate's key as a tab-separated file to self.paths_dict['key'].
        sets self.plate and self.ntimepoints (number of rows in plate.time).
    '''

    growth_plate = GrowthPlate(self.master_data, self.master_mapping)

    growth_plate.convertTimeUnits(
        input=getTimeUnits('input'),
        output=getTimeUnits('output'),
    )
    growth_plate.logData()
    growth_plate.subtractBaseline(
        to_do=True,
        poly=getValue('PolyFit'),
        groupby=list(self.non_time_varbs),
    )
    growth_plate.subtractControl(to_do=self.subtract_control, drop=True)

    # save the key of the processed plate
    growth_plate.key.to_csv(self.paths_dict['key'], sep='\t', header=True, index=True)

    self.ntimepoints = growth_plate.time.shape[0]
    self.plate = growth_plate
def model(self, nthin=1, store=False, verbose=False):
    '''
    Infers growth parameters of interest (including diauxic shifts) by Gaussian Process
        fitting of data, one sample (row of self.key) at a time.

    Args:
        nthin (int): passed on to GrowthModel() for each sample.
        store (boolean): if True, per-sample curve data are stored as self.gp_data.
        verbose (boolean): if True, progress messages are printed via smartPrint().

    Actions:
        modifies self.key (joins growth parameters, then merges diauxie results on
            'Sample_ID'), and may create self.gp_data.

    Returns:
        None
    '''

    # initialize variables for storing parameters and data
    data_ls, diauxie_dict = [], {}
    gp_params = initParamDf(self.key.index, 0)

    for sample_id in self.key.index:

        pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values
        smartPrint('Fitting {}\t{}'.format(pid, well), verbose)

        # extract sample
        args_dict = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict()
        sample = self.extractGrowthData(args_dict)

        df = sample.time.join(sample.data)
        df.columns = ['Time', 'OD']

        # create GP object and analyze
        gm = GrowthModel(df=df,
                         baseline=sample.key.OD_Baseline.values,
                         ARD=False,
                         heteroscedastic=False,
                         nthin=nthin)
        curve = gm.run(name=sample_id)

        # the diauxie dataframe is split from the parameters before they are recorded
        diauxie_dict[sample_id] = curve.params.pop('df_dx')
        gp_params.loc[sample_id, :] = curve.params

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if store:
            data_ls.append(curve.data())

    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    # record results in object's key
    self.key = self.key.join(gp_params)
    self.key = pd.merge(self.key, diauxie_df, on='Sample_ID')

    # plotting needs transformed (or real) OD & GP fit, & may need GP derivative,
    # save all as object attributes
    if store:
        self.gp_data = pd.concat(data_ls).reset_index(drop=True)

    return None
def prepDataForFitting(data,
                       mapping,
                       subtract_baseline=True,
                       subtract_control=False,
                       subtract_blanks=False,
                       log_transform=False,
                       drop_flagged_wells=False):
    '''
    Packages data set into a growth.GrowthPlate() object and transforms data in
        preparation for GP fitting.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        subtract_baseline (boolean): passed to computeFoldChange() and subtractBaseline().
        subtract_control (boolean): whether control wells are subtracted.
        subtract_blanks (boolean): whether blank wells are subtracted.
        log_transform (boolean): whether data are natural-log transformed.
        drop_flagged_wells (boolean): passed to dropFlaggedWells().

    Returns:
        plate (growth.GrowthPlate() object)
    '''

    # merge data-sets for easier analysis and perform basic summaries and manipulations
    growth_plate = GrowthPlate(data=data, key=mapping)

    units_in = getTimeUnits('input')
    units_out = getTimeUnits('output')
    growth_plate.convertTimeUnits(input=units_in, output=units_out)

    growth_plate.computeBasicSummary()
    growth_plate.computeFoldChange(subtract_baseline=subtract_baseline)

    # blanks first, then controls
    drop_blanks = getValue('drop_blank_wells')
    growth_plate.subtractControl(to_do=subtract_blanks, drop=drop_blanks, blank=True)

    drop_controls = getValue('drop_control_wells')
    growth_plate.subtractControl(to_do=subtract_control, drop=drop_controls, blank=False)

    # replace non-positive values, necessary prior to log-transformation
    growth_plate.raiseData()

    # natural-log transform
    growth_plate.logData(to_do=log_transform)

    # subtract first T0 (or rather divide by first T0)
    growth_plate.subtractBaseline(subtract_baseline, poly=False)

    growth_plate.dropFlaggedWells(to_do=drop_flagged_wells)

    return growth_plate
def setAxesLabels(ax, subtract_control, plot_params, logged=True, fontsize=20):
    '''
    Given an axis and analysis parameters, determine appropriate labels for axes and
        adjust them accordingly.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        subtract_control (boolean): if True, the y-label is prefixed with 'Normalized'.
        plot_params (dictionary): its 'fontsize' entry, when present, sets the size of
            both axis labels.
        logged (boolean): if True, the y-label is prefixed with 'ln'.
        fontsize (float): fall-back label size, used only when plot_params has no
            'fontsize' entry.

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)

    Action:
        changes matplotlib's global rcParams (mathtext and font family settings), which
            affects every figure drawn afterwards.
    '''

    import matplotlib as mpl

    mpl.rcParams["mathtext.default"] = 'regular'
    mpl.rcParams["font.family"] = 'sans-serif'
    mpl.rcParams["font.sans-serif"] = 'Arial'

    base = getValue('hypo_plot_y_label')
    if logged:
        base = 'ln {}'.format(base)

    ylabel = 'Normalized {}'.format(base) if subtract_control else base

    # plot_params takes precedence; the fontsize argument used to be silently ignored
    label_size = plot_params.get('fontsize', fontsize)

    ax.set_xlabel('Time ({})'.format(getTimeUnits('output')), fontsize=label_size)
    ax.set_ylabel(ylabel, fontsize=label_size)

    return ax
def LagTime(self):
    '''
    Computes the lag time with both the classical definition and a probabilistic
        definition.

    The former defines the lag time as the intersection with the axis parallel to time
        of the tangent to the latent function at the time of maximum growth rate. This
        tangent has slope m equivalent to the derivative of the latent at that time.

    The latter is the first time point at which the Gaussian probability that the growth
        rate (i.e. derivative of latent) is below zero no longer exceeds the
        'confidence_adapt_time' value from the config.

    Reads:
        self.x, self.y0, self.y1, self.cov1, self.t_gr

    Actions:
        sets self.lagC (classical) and self.lagP (probabilistic); either is numpy.inf
            when it can not be determined.
    '''

    x = self.x
    y0 = self.y0
    y1 = self.y1
    cov1 = self.cov1

    # CLASSICAL MODE

    t_gr = self.t_gr  # time at maximal growth rate

    # index at maximal growth rate; take the first match explicitly because converting
    # a 1-D array with int() is deprecated in NumPy and fails on multiple matches
    x_gr = int(np.where(x[:, 0] == t_gr)[0][0])

    m1 = y1[x_gr]  # slope at maximal growth rate
    m0 = y0[x_gr]  # log OD at maximal growth rate

    if m1 == 0:
        lagC = np.inf  # no growth, then infinite lag
    else:
        lagC = (t_gr - (m0 / m1))[0]

    # PROBABILISTIC MODE

    confidence = getValue('confidence_adapt_time')

    # P(growth rate < 0) at each time point, from the mean and variance of the derivative
    prob = norm.cdf(0, y1[:, 0], np.sqrt(np.diag(cov1)))

    # first time point where that probability is not above the confidence level
    not_above = np.flatnonzero(~(prob > confidence))

    if not_above.size == 0:
        lagP = np.inf
    else:
        lagP = float(self.x[not_above[0]][0])

    self.lagC = lagC
    self.lagP = lagP
def minimizeDiauxieReport(df):
    '''
    Minimizes a pandas.DataFrame to only include parameters indicated in the config.py
        file under 'report_parameters' variable.

    Columns are dropped only if they appear in initDiauxieList() (presumably the full
        list of diauxie parameter names -- verify) and not in the requested list; all
        other columns are kept.

    Args:
        df (pandas.DataFrame)

    Returns:
        (pandas.DataFrame)
    '''

    requested = initDiauxieList(getValue('report_parameters'))
    candidates = initDiauxieList()

    # candidate columns that exist in df but were not requested
    unwanted = (set(candidates) & set(df.keys())) - set(requested)

    return df.drop(unwanted, axis=1)
def sample(self):
    '''
    Sample the posterior distribution of the latent function and its derivative n times,
        estimate growth parameters for each sample, then summarize with mean and
        standard deviation.

    Actions:
        sets self.posterior, a dictionary with keys 'mean(<param>)' and 'std(<param>)'.

    Returns:
        self
    '''

    def _labelled(series, stat):
        # relabel 'param' as 'stat(param)'
        series.index = ['{}({})'.format(stat, name) for name in series.index]
        return series

    n = getValue('n_posterior_samples')

    # the latent function and its derivative are sampled independently of each other
    draws0 = np.random.multivariate_normal(self.y0.ravel(), self.cov0, n)
    draws1 = np.random.multivariate_normal(self.y1.ravel(), self.cov1, n)

    # one GrowthCurve (and one set of parameters) per pair of draws
    list_params = [
        GrowthCurve(x=self.x,
                    y=self.y,
                    y0=draw0[:, np.newaxis],
                    y1=draw1[:, np.newaxis],
                    cov0=self.cov0,
                    cov1=self.cov1).params
        for draw0, draw1 in zip(draws0, draws1)
    ]

    df_params = pd.DataFrame(list_params)

    averages = _labelled(df_params.mean(), 'mean')
    deviations = _labelled(df_params.std(), 'std')

    self.posterior = pd.concat([averages, deviations]).to_dict()

    return self
def describeVariance(df, time='X0', od='Y'):
    '''
    Estimates the variance of the measurements at each time point (ignoring NaNs),
        smooths it with a Gaussian filter, and attaches it to the data as column 'error'.

    Args:
        df (pandas.DataFrame): columns ['X0','X1',...,'Y']; values of Xs except for X0
            should be non-unique.
        time (str): name of the time column.
        od (str): name of the measurement column. If df has no such column, the column
            'OD' is used instead (this function previously always read 'OD').

    Returns:
        df (pandas.DataFrame): input rows sorted by time with an extra 'error' column.
    '''

    window = getValue('variance_smoothing_window')

    # honour the od argument, but keep working for callers that relied on 'OD'
    od_col = od if od in df.columns else 'OD'

    # a window below 1 is read as a fraction of the number of time points
    nX = len(df[time].drop_duplicates())
    if window < 1:
        window = int(np.ceil(nX * window))

    df = df.sort_values(time)
    df.reset_index(drop=True, inplace=True)

    # variance across all rows that share a time point
    error = df[[time, od_col]]
    error = error.groupby([time]).apply(lambda g: np.nanvar(g[od_col]))
    error = pd.DataFrame(error, columns=['error'])
    error = error.reset_index()
    error = error.drop_duplicates().set_index(time).sort_index()

    # smooth the variance along time
    error.loc[:, 'error'] = filters.gaussian_filter1d(error.error.values, window)

    # attach the smoothed variance to every row of its time point
    df = pd.merge(df, error, on=time, how='outer').sort_values([time])

    return df
def plot(self, ax_arg=None):
    '''
    Plots the data (self.y) and latent function (self.y0) in a top panel and the
        derivative of the latent function (self.y1) in a bottom panel.

    Args:
        ax_arg (sequence of two matplotlib axes or None): axes to draw on; if None, a
            new 2x1 figure is created.

    Returns:
        fig, ax: if ax_arg is None (figure was created here)
        ax: otherwise
    '''

    # test against None: the truth value of an array of axes is ambiguous
    created_here = ax_arg is None

    if created_here:
        fig, ax = plt.subplots(2, 1, figsize=[6, 8], sharex=True)
    else:
        ax = ax_arg

    t = self.x.ravel()
    y = self.y.ravel()
    y0 = self.y0.ravel()
    y1 = self.y1.ravel()

    xmin = 0
    xmax = int(np.ceil(t[-1]))

    ax[0].plot(t, y, lw=5, color=(0, 0, 0, 0.65))
    ax[0].plot(t, y0, lw=5, color=(1, 0, 0, 0.65))
    ax[1].plot(t, y1, lw=5, color=(0, 0, 0, 0.65))

    for panel in (ax[0], ax[1]):
        for tick_label in panel.get_xticklabels() + panel.get_yticklabels():
            tick_label.set(fontsize=20)

    ylabel = getValue('hypo_plot_y_label')

    ax[1].set_xlabel('Time', fontsize=20)
    ax[0].set_ylabel(ylabel, fontsize=20)
    ax[1].set_ylabel('d/dt {}'.format(ylabel), fontsize=20)

    ax[0].set_xlim([xmin, xmax])

    if created_here:
        return fig, ax
    return ax
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.
        While runGrowthFitting() analyzes data one plate at a time,
        runCombinedGrowthFitting() can pool experimental replicates across different
        plates. The downside is that data summary must be merged and no 96-well plate
        grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and value are user/default choices; this
            function reads 'fout', 'pb', 'slf', 'fn', 'nthin', and 'sgd'.
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory, if
            args['sgd'] is set.
        exits the program (sys.exit) if any key in args['pb'] is missing from the
            mapping.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True, poly=getValue('PolyFit'), groupby=combine_keys)

    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()

    # one row per unique combination of the grouping variables
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(subset=combine_keys).reset_index(drop=True)

    smartPrint('AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
               verbose)

    data_ls, diauxie_dict = [], {}

    # get user choices
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # index values for N samples

        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1
        cond_data = cond_data.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data, ARD=True, heteroscedastic=fix_noise, nthin=nthin)
        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:

            time_df = pd.DataFrame(gm.x_new, columns=['Time'])

            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                sigma_noise = np.ravel([gm.noise] * time_df.shape[0])

            mu_var = pd.DataFrame([mu0, var0, mu1, var1, sigma_noise],
                                  index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T

            # repeat the condition's meta-data on every time point
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time_df).join(mu_var)

            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in the table of conditions
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    # split results into a growth-parameter summary and a diauxie summary
    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()

    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary', '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie', '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    # axis must be a keyword: pandas 2.0 no longer accepts it positionally
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename, 'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
def runGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and value are user/default choices; this
            function reads 'pool', 'sgd', 'plot', 'pd', 'nthin', 'merges', and 'fout'.
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
        if args['pool'] is set, all work is delegated to runCombinedGrowthFitting().
    '''

    import shutil  # local import, only needed to clean up the temporary directory

    if args['pool']:
        runCombinedGrowthFitting(data, mapping, directory, args, verbose=verbose)
        return None

    # only store data if user requested its writing or requested plotting
    store = bool(args['sgd'] or args['plot'] or args['pd'])

    # if user requested merging of summary/data, store each plate's data/summary in
    # temp directory first
    tmpdir = tempfile.mkdtemp()
    saved_umask = os.umask(0o77)  # files can only be read/written by creator for security

    # the umask and the temporary directory must be restored/removed even on failure
    try:

        print('Temporary directory is {}\n'.format(tmpdir))

        # pre-process data
        plate = prepDataForFitting(data, mapping, subtract_baseline=True)

        ls_temp_files = []
        ls_summ_files = []
        ls_diux_files = []

        # for each plate, get samples and save individual text file for plate-specific
        # summaries
        for pid in plate.key.Plate_ID.unique():

            smartPrint('Fitting {}'.format(pid), verbose)

            # grab plate-specific summary
            sub_plate = plate.extractGrowthData(args_dict={'Plate_ID': pid})

            # the primary motivation of this function: run gp model
            sub_plate.model(nthin=args['nthin'], store=store, verbose=verbose)

            # normalize parameters, if requested
            sub_plate.key = normalizeParameters(args, sub_plate.key)

            # save plots, if requested by user
            savePlots(sub_plate, args, directory, pid)

            # define file paths where data will be written
            if args['merges']:
                temp_path = assembleFullName(tmpdir, '', pid, 'gp_data', '.txt')
                summ_path = assembleFullName(tmpdir, '', pid, 'summary', '.txt')
                diux_path = assembleFullName(tmpdir, '', pid, 'diauxie', '.txt')
            else:
                temp_path = assembleFullName(directory['derived'], '', pid, 'gp_data', '.txt')
                summ_path = assembleFullName(directory['summary'], '', pid, 'summary', '.txt')
                diux_path = assembleFullName(directory['summary'], '', pid, 'diauxie', '.txt')

            # save data, if requested by user
            savePlateData(args['sgd'], sub_plate, temp_path, summ_path, diux_path)

            # track all potentially created files
            ls_temp_files.append(temp_path)
            ls_summ_files.append(summ_path)
            ls_diux_files.append(diux_path)

        # if user did not pass file name for output, use time stamp, see selectFileName()
        filename = selectFileName(args['fout'])

        # if user requested merging, merge all files in temporary directory
        mergeSummaryData(args, directory, ls_temp_files, ls_summ_files, ls_diux_files,
                         filename)

    finally:

        # restore permissions mask and remove temporary directory (with any left-overs)
        os.umask(saved_umask)
        shutil.rmtree(tmpdir, ignore_errors=True)

    return None
def plot(self,
         save_path='',
         plot_fit=False,
         plot_derivative=False,
         plot_raw_with_fit=False):
    '''
    Creates a 8x12 grid plot (for 96-well plate) that shows the growth curves in each
        well. Plot aesthetics require several parameters that are saved in config.py and
        pulled using functions in misc.py. Plot will be saved as a PDF to location
        passed via argument. Index column for object's key should be Well IDs but
        object's key should also have a Well column.

    Args:
        save_path (str): file path: if empty, plot will not be saved at all.
        plot_fit (boolean): whether to plot GP fits ('OD_Growth_Fit') on top of
            'OD_Growth_Data' from self.gp_data.
        plot_derivative (boolean): if True, plot only the derivative of GP fit
            ('GP_Derivative') instead; takes precedence over the other two options.
        plot_raw_with_fit (boolean): whether to plot 'OD_Fit' on top of 'OD_Data' from
            self.gp_data; ignored if plot_fit is True.

    Returns:
        fig,axes: figure and axis handles (the figure is closed before returning), or
            None if the object is not a single multi-well plate.

    Action:
        if user passes save_path argument, plot will be saved as PDF in desired location
    '''

    sns.set_style('whitegrid')

    self.addLocation()

    time = self.time

    cols = ['Sample_ID', 'Plate_ID', 'Well', 'Row', 'Column',
            'Fold_Change', 'OD_Max', 'OD_Baseline']
    key = self.key.reindex(cols, axis='columns')
    key = key.dropna(axis=1, how='all')  # columns missing from self.key are all-NaN
    if 'Sample_ID' in key.columns:
        key = key.drop_duplicates().set_index('Sample_ID')

    # make sure plate is 96-well version, otherwise skip plotting
    # NOTE(review): 'Row' and 'Column' are only dropped from self.key at the end of a
    # successful plot, not on this early exit -- confirm whether that is intended
    if not self.isSingleMultiWellPlate():
        msg = 'WARNING: GrowthPlate() object for {} is not a 96-well plate. '.format(
            self.key.Plate_ID.iloc[0])
        msg += 'AMiGA can not plot it.\n'
        print(msg)
        return None

    # select what is drawn as the main curve and, optionally, what is overlaid on it
    overlay_y = None
    if plot_derivative:
        base_y = self.gp_data.pivot(columns='Sample_ID', index='Time', values='GP_Derivative')
    elif plot_fit:
        base_y = self.gp_data.pivot(columns='Sample_ID', index='Time', values='OD_Growth_Data')
        overlay_y = self.gp_data.pivot(columns='Sample_ID', index='Time', values='OD_Growth_Fit')
    elif plot_raw_with_fit:
        base_y = self.gp_data.pivot(columns='Sample_ID', index='Time', values='OD_Data')
        overlay_y = self.gp_data.pivot(columns='Sample_ID', index='Time', values='OD_Fit')
    else:
        base_y = self.data

    # an overlay only exists when the derivative was not requested; guarding on it
    # avoids referencing an undefined overlay when both options are passed
    show_overlay = overlay_y is not None

    fig, axes = plt.subplots(8, 12, figsize=[12, 8])

    # define window axis limits
    ymax = np.ceil(base_y.max(1).max())
    ymin = np.floor(base_y.min(1).min())

    if plot_fit:
        ymin = 0

    xmin = 0
    xmax = time.values[-1]
    xmax_up = int(np.ceil(xmax))  # round up to nearest integer

    for well in base_y.columns:

        # select proper sub-plot ('Row' and 'Column' are 1-based)
        r, c = (int(v) for v in key.loc[well, ['Row', 'Column']] - 1)
        ax = axes[r, c]

        # get colors based on fold-change parameters
        if 'Fold_Change' in key.keys():
            color_l, color_f = getPlotColors(key.loc[well, 'Fold_Change'])
        else:
            color_l = getValue('fcn_line_color')
            color_f = getValue('fcn_face_color')

        # set window axis limits
        ax.set_xlim([xmin, xmax])
        ax.set_ylim([ymin, ymax])

        # define x-data and y-data points
        x = np.ravel(time.values)
        y = base_y.loc[:, well].values

        # plot line and fill_between, if plotting OD estimate
        ax.plot(x, y, color=color_l, lw=1.5, zorder=10)
        if not plot_derivative:
            ax.fill_between(x=x,
                            y1=[ax.get_ylim()[0]] * len(y),
                            y2=y,
                            color=color_f,
                            zorder=7)

        # add fit lines, if desired
        if show_overlay:
            y_fit = overlay_y.loc[:, well].values
            ax.plot(x, y_fit, color='yellow', alpha=0.65, ls='--', lw=1.5, zorder=10)

        # show tick labels for bottom left subplot only, so by default no labels
        if plot_derivative:
            # zero derivative indicates no instantaneous growth
            plt.setp(ax, yticks=[ymin, 0, ymax], yticklabels=[])
        else:
            plt.setp(ax, yticks=[ymin, ymax], yticklabels=[])
        plt.setp(ax, xticks=[xmin, xmax], xticklabels=[])

        # add well identifier on top left of each sub-plot
        well_color = getTextColors('Well_ID')
        ax.text(0., 1., key.loc[well, 'Well'],
                color=well_color, ha='left', va='top', transform=ax.transAxes)

        # add Max OD value on top right of each sub-plot
        if self.mods.floored:
            od_max = key.loc[well, 'OD_Max'] - key.loc[well, 'OD_Baseline']
        else:
            od_max = key.loc[well, 'OD_Max']
        ax.text(1., 1., "{0:.2f}".format(od_max),
                color=getTextColors('OD_Max'), ha='right', va='top',
                transform=ax.transAxes)

    # show tick labels for bottom left sub-plot only
    plt.setp(axes[7, 0], xticks=[0, xmax], xticklabels=[0, xmax_up])
    plt.setp(axes[7, 0], yticks=[ymin, ymax], yticklabels=[ymin, ymax])

    # add x- and y-labels and title
    ylabel_base = getValue('grid_plot_y_label')
    ylabel_mod = 'ln ' if self.mods.logged else ''

    if plot_derivative:
        ylabel_text = 'd[ln{}]/dt'.format(ylabel_base)
    else:
        ylabel_text = ylabel_mod + ylabel_base

    # add labels and title
    fig.text(0.512, 0.07, 'Time ({})'.format(getTimeUnits('output')),
             fontsize=15, ha='center', va='bottom')
    fig.text(0.100, 0.50, ylabel_text,
             fontsize=15, ha='right', va='center', rotation='vertical')
    # the title is the Plate_ID of the last well drawn
    fig.suptitle(x=0.512, y=0.93, t=key.loc[well, 'Plate_ID'],
                 fontsize=15, ha='center', va='center')

    # if no file path passed, do not save
    if save_path != '':
        fig.savefig(save_path, bbox_inches='tight')

    # remove the location columns again
    self.key.drop(['Row', 'Column'], axis=1, inplace=True)

    # close this figure explicitly rather than whichever figure is current
    plt.close(fig)

    return fig, axes
def detectDiauxie(x, y0, y1, y2, cov0, cov1, thresh, varb='K'):
    '''
    Decompose a growth curve into individual growth phases separated by OD inflection.

    Args:
        x (numpy.ndarray): time
        y0 (numpy.ndarray): mean of latent function
        y1 (numpy.ndarray): mean of first derivative of latent function (i.e. growth rate)
        y2 (numpy.ndarray): mean of second derivative of latent function (e.g. acceleration)
        cov0 (numpy.ndarray): covariance of latent function (not used here)
        cov1 (numpy.ndarray): covariance of first derivative of latent function (not used here)
        thresh (float): a growth phase is merged into a neighbour while its varb is
            smaller than thresh times the largest varb among all phases
        varb (str): use either 'K' or 'r' to threshold/call secondary growth curves

    Returns:
        ret (pandas.DataFrame): dataframe summarizes each growth phase with following:
            t_left: time at left bound
            t_right: time at right bound
            K: total growth
            r: maximum growth rate
            r_left: growth rate at left bound
            r_right: growth rate at right bound
    '''

    # the other variable only breaks ties when sorting phases
    if varb == 'K':
        second_varb = 'r'
    else:
        second_varb = 'K'

    if x.ndim > 1:
        x = x[:, 0].ravel()  # assumes time is first dimension

    # indices for inflections (sign changes of the second derivative)
    ips = list(np.where(np.diff(np.sign(y2.ravel())))[0])

    # no inflection, or too little growth overall: report a single phase
    if len(ips) == 0 or np.max(y0) < getValue('diauxie_k_min'):
        cols = ['t_left', 't_right', 'K', 'r', 'r_left', 'r_right']
        # K is the maximum of the latent function, the same quantity that is tested
        # against 'diauxie_k_min' above (it used to be the maximum of time)
        ret = pd.DataFrame(
            [x[0], x[-1], np.max(y0), np.max(y1), y1[0][0], y1[-1][0]],
            index=cols)
        return ret.T

    # types of inflections
    its = [
        np.sign(y2[ii + 1])[0] if ii < (len(y2) - 2) else -1 * np.sign(y2[ii - 1])[0]
        for ii in ips
    ]

    # pad edge cases
    ips, its = pad(ips, its, edge=1, length=len(y2))
    ips, its = pad(ips, its, edge=-1, length=len(y2))

    # convert data types
    ips = np.array([int(ii) for ii in ips])
    its = np.array(its)

    # define bounds of each growth stage
    starts = np.where(its == 1)[0][:-1]
    stops = starts + 2

    # initialize a summary dataframe and populate with bounds
    ret = np.zeros((len(starts), 7))
    ret[:, 0] = ips[starts]
    ret[:, 1] = ips[stops]

    # compute several metrics for growth stage (should I use absolute?)
    bounds = [(int(ii[0] + 1), int(ii[1] + 1)) for ii in ret]

    ret[:, 2] = [np.max((y0[l:r] - y0[l])) for l, r in bounds]  # total change in OD
    ret[:, 3] = [np.max(y1[l:r]) for l, r in bounds]  # max growth rate
    # growth rate at both bounds; extract scalars explicitly instead of relying on
    # implicit conversion of size-1 arrays
    ret[:, 4:6] = [[np.ravel(y1[l - 1])[0], np.ravel(y1[r - 1])[0]] for l, r in bounds]

    # define attraction of each growth stage: a growth stage is attracted
    #   to the adjacent growth stage with the least difference in terms of
    #   growth rate at the shared bounds (relative to max growth rate
    #   within the bounds)
    ret[:, 6] = [
        -1 if np.abs(row[5] - row[3]) > np.abs(row[4] - row[3]) else 1
        for row in ret
    ]

    # annotate dataframe
    cols = ['t_left', 't_right', 'K', 'r', 'r_left', 'r_right', 'attraction']
    ret = pd.DataFrame(ret, columns=cols)

    # how to deal with negative r or K
    # if at least one value is nonzero positive
    if any(ii > 0 for ii in ret[varb].values):

        # starting with the smallest growth stage (smallest total change in OD):
        #   if its K is smaller than a certain proportion of the max K
        #   merge with attractor, continue until all growth phases meet criteria
        while ret[varb].min() < thresh * ret[varb].max():

            ret = ret.sort_values(['t_left'])
            ret.iloc[0, -1] = 1  # first phase is always attracted forward in time
            ret.iloc[-1, -1] = -1  # last phase is always attracted backward in time

            ret = ret.sort_values([varb, second_varb])
            idx = ret.index.values[0]
            att = ret.loc[idx, 'attraction']
            att = idx + att

            ret = mergePhases(ret, idx, att, varb=varb)  # should you re-compute attraction?

    else:

        while ret.shape[0] > 1:  # coalesce all into a single curve
            ret = mergePhases(ret, 0, 1)

    # re-sort by time and convert array indices to time values
    ret = ret.sort_values(['t_left'])
    ret.iloc[:, 0] = ret.iloc[:, 0].apply(lambda i: x[int(i)])
    ret.iloc[:, 1] = ret.iloc[:, 1].apply(lambda i: x[int(i)])
    ret.drop('attraction', axis=1, inplace=True)

    return ret
def describe(self):
    '''
    Computes all growth parameters of the curve and stores them in self.params.

    If self.y2 is available, the curve is also decomposed into growth phases with
        detectDiauxie(); each phase is described with its own parameters and params gains
        'diauxie' (1 if more than one phase was detected, else 0) and 'df_dx' (one row
        per phase, columns prefixed with 'dx_').

    Actions:
        sets self.params (dictionary)
    '''

    dx_ratio_min = getValue('diauxie_ratio_min')
    dx_ratio_varb = getValue('diauxie_ratio_varb')

    # each of these sets the attributes that are collected below
    for compute in (self.AreaUnderCurve, self.CarryingCapacity, self.MaxGrowthRate,
                    self.MinGrowthRate, self.LagTime, self.StationaryDelta):
        compute()

    params = {
        'auc_lin': self.auc_lin,
        'auc_log': self.auc_log,
        'k_lin': self.K_lin,
        'k_log': self.K_log,
        't_k': self.t_K,
        'gr': self.gr,
        'dr': self.dr,
        'td': self.td,
        't_gr': self.t_gr,
        't_dr': self.t_dr,
        'death_lin': self.death_lin,
        'death_log': self.death_log,
        'lagC': self.lagC,
        'lagP': self.lagP
    }

    # without a second derivative there is nothing to decompose
    if self.y2 is None:
        self.params = params
        return

    dx = detectDiauxie(self.x, self.y0, self.y1, self.y2, self.cov0, self.cov1,
                       thresh=dx_ratio_min, varb=dx_ratio_varb)

    # describe all phases
    last_index = len(self.x) - 1
    phases = []

    for idx, row in dx.iterrows():

        t_left, t_right = row['t_left'], row['t_right']

        # positions of the phase bounds in the time array
        i0, i1 = [np.where(self.x == tt)[0][0] for tt in (t_left, t_right)]

        if (i0 == 0) and (i1 == last_index):
            # phase spans the whole curve, so re-use the parameters computed above
            # NOTE(review): this is the same dictionary object as params, so 't0' and
            # 'tf' below are also added to self.params -- confirm this is intended
            dx_params = params
        else:
            # describe the phase as a curve of its own, shifted to start at zero
            phase = GrowthCurve(x=self.x[i0:i1],
                                y0=self.y0[i0:i1] - self.y0[i0],
                                y1=self.y1[i0:i1],
                                cov0=self.cov0[i0:i1, i0:i1],
                                cov1=self.cov1[i0:i1, i0:i1])
            dx_params = phase.params

        dx_params['t0'] = t_left
        dx_params['tf'] = t_right
        phases.append(pd.DataFrame(dx_params, index=[idx]))

    df_dx = pd.concat(phases, axis=0)
    df_dx.columns = ['dx_{}'.format(ii) for ii in df_dx.columns]

    params.update({
        'diauxie': 1 if dx.shape[0] > 1 else 0,
        'df_dx': df_dx
    })

    self.params = params
def plotPredictions(self):
    '''
    Visualizes the model tested by a specific hypothesis given the data.

    Reads:
        self.args: 'dp' (if set, nothing is plotted), 'noise', and 'pdo' (whether to
            plot the functional difference)
        self.x_full (pandas.DataFrame), self.x_min (pandas.DataFrame)
        self.factor_dict (dictionary): mapping of unique values of variables to
            numerical integers
        self.target (list): first item is the variable of interest
        self.plate (growth.GrowthPlate obj)
        self.subtract_control (boolean)
        self.paths_dict (dictionary): 'dir' and 'filename' define where the figure
            is saved
        self.functional_diff: only if the functional difference is plotted

    Action:
        saves a plot as PDF file
    '''

    # user asked not to plot, so skip all work
    if self.args['dp']:
        return None

    # get necessary attributes
    x_full = self.x_full
    x_min = self.x_min
    variable = self.target[0]
    plate = self.plate
    subtract_control = self.subtract_control
    directory = self.paths_dict['dir']
    file_name = self.paths_dict['filename']
    noise = self.args['noise']

    # get and modify user-accessible parameters from config.py
    plot_params = getHypoPlotParams()  # dict
    tick_spacing = plot_params['tick_spacing']
    legend_loc = plot_params['legend']
    fontsize = plot_params['fontsize']

    colors = getValue('hypo_colors')  # list of colors
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95
    confidence = 1 - (1 - confidence) / 2  # two-sided

    # grab mapping of integer codes in design matrix to actual variable labels
    varb_codes_map = reverseDict(self.factor_dict[variable])  # {codes:values}

    # set figure aesthetics
    sns.set_style('whitegrid')
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.sans-serif'] = 'Arial'

    # initialize grid
    fig, ax = plt.subplots(2, 1, figsize=[5, 10.5], sharex=False)

    # for each unique value of variable of interest, plot MVN prediction
    list_values = varb_codes_map.items()
    list_colors = colors[0:x_min.shape[0]]

    # plot MVN predictions
    for (code, label), color in zip(list_values, list_colors):

        criteria_real = {variable: [label]}
        criteria_mvn = {variable: code}

        ax[0] = addRealPlotLine(ax[0], plate, criteria_real, color, plot_params)
        ax[0] = addMVNPlotLine(ax[0], x_full, criteria_mvn, label, confidence, color,
                               plot_params, noise)
        ax[0].xaxis.set_major_locator(MultipleLocator(tick_spacing))

    # adjust labels and window limits
    ax[0] = setAxesLabels(ax[0], subtract_control, plot_params)

    # if variable has only 2 values and if requested, plot delta OD
    if (len(list_values) == 2) and self.args['pdo']:
        ax[1] = plotDeltaOD(ax[1], self.functional_diff, ylabel=True, xlabel=True,
                            fontsize=fontsize)
        ax[1].xaxis.set_major_locator(MultipleLocator(tick_spacing))
        ax[0].set_xlabel('')  # the bottom panel carries the x-label
    else:
        fig.delaxes(ax[1])

    ax = dynamicWindowAdjustment(ax)

    # define where the figure will be saved
    fig_path = assemblePath(directory, file_name, '.pdf')
    plt.subplots_adjust(wspace=0.15, hspace=0.15)
    savePlotWithLegends(ax[0], fig_path, legend_loc, fontsize=fontsize)
def computeFullDifference(self):
    '''
    Computes the full difference between two latent functions (modelling growth curves).

    The predictions in self.x_full are sorted by the variable of interest and then by
    Time, and must therefore consist of exactly two stacked curves of equal length.
    The difference between the two curves is evaluated at every time point after the
    first, with the difference at the first time point subtracted (see buildTestMatrix).
    Curves of that difference are then sampled from a multivariate normal distribution
    to summarise the Euclidean norm of the difference.

    Reads from self:
        x_full (pandas.DataFrame): must include columns Time, mu (mean of latent
            function), Sigma (diagonal covariance of latent function), the variable of
            interest, and Noise (only read if args['noise'] is truthy)
        target (list): target[0] is the variable of interest
        args (dictionary): 'noise' (whether Noise is added to Sigma), 'sgd' (whether
            the difference table is also written to a tab-separated file)
        paths_dict (dictionary): 'dir' and 'filename', only read when args['sgd'] is
            truthy

    Reads from config (getValue):
        confidence (float [0.0,1.0]): confidence interval, e.g. 0.95 for 95%.
        n_posterior_samples (int): number of difference curves to sample

    Sets on self:
        functional_diff (pandas.DataFrame): columns Time, Avg, Low, Upp
        delta_od_sum_mean (float): mean over the sampled curves of ||OD(t)||, the
            square root of the sum of squared differences
        delta_od_sum_ci (tuple): (lower, upper) = mean -/+ z * standard deviation of
            the sampled norms, where z is the normal quantile for the confidence level

    Raises:
        ValueError: if the number of rows in x_full is not twice the number of
            unique time points
    '''

    variable = self.target[0]

    # turn a two-sided interval (e.g. 0.95) into its upper quantile (e.g. 0.975)
    confidence = 1 - (1 - getValue('confidence')) / 2
    n_samples = getValue('n_posterior_samples')

    def buildTestMatrix(x_time):
        '''
        Build a test matrix to simplify OD full difference computation.
            See https://github.com/ptonner/gp_growth_phenotype/testStatistic.py

        This is used to compare two growth latent functions. The difference
            between first time points (measurements) is adjusted to zero.

        Args:
            x_time (pandas.DataFrame or pandas.Series or numpy.ndarray), ndim > 1

        Returns:
            A (numpy.ndarray): N-1 x 2*N where N is length of time.
        '''

        n = x_time.shape[0]
        rows = np.arange(n - 1)

        # row i encodes  [f2(t_i+1) - f1(t_i+1)] - [f2(t_0) - f1(t_0)]
        A = np.zeros((n - 1, 2 * n))
        A[:, 0] = 1
        A[rows, rows + 1] = -1
        A[:, n] = -1
        A[rows, n + rows + 1] = 1

        return A

    x_diff = self.x_full.sort_values([variable, 'Time'])
    x_time = x_diff.Time.drop_duplicates()
    n_time = x_time.shape[0]

    # define mean and (diagonal) covariance of data
    mu = x_diff['mu'].values
    if mu.shape[0] != 2 * n_time:
        raise ValueError(
            'Functional difference needs exactly two curves over the same time '
            'points: found {} rows for {} unique time points.'.format(
                mu.shape[0], n_time))

    variance = x_diff['Sigma'].values
    if self.args['noise']:
        variance = variance + x_diff['Noise'].values

    # define mean and covariance of functional difference
    A = buildTestMatrix(x_time)
    m = np.dot(A, mu)
    # the covariance is diagonal, so A.Sigma.A^T only needs the columns of A scaled
    c = np.dot(A * variance, A.T)
    std = np.sqrt(np.diag(c))

    # sample the curve for the difference between functions, from an MVN distribution
    samples = np.random.multivariate_normal(m, c, n_samples)

    # Euclidean norm of the functional difference for every sampled curve
    dos = np.linalg.norm(samples, axis=1)
    dos_mu, dos_std = np.mean(dos), np.std(dos)

    # confidence interval for the norm, and credible band for the difference curve
    scaler = norm.ppf(confidence)
    ci = (dos_mu - scaler * dos_std, dos_mu + scaler * dos_std)
    y_low = m - scaler * std
    y_upp = m + scaler * std

    # package results; the first time point is the reference and so is dropped
    t = x_time.iloc[1:].values
    df = pd.DataFrame([t, m, y_low, y_upp],
                      index=['Time', 'Avg', 'Low', 'Upp']).T

    self.functional_diff = df
    self.delta_od_sum_mean = dos_mu
    self.delta_od_sum_ci = ci

    # save gp_data fit
    if self.args['sgd']:
        file_path = assembleFullName(self.paths_dict['dir'], '',
                                     self.paths_dict['filename'], 'func_diff',
                                     '.txt')
        df.to_csv(file_path, sep='\t', header=True, index=True)
def savePredictions(self):
    '''
    Given model predictions of growth curves (for each unique set of conditions tested),
        describe the latent function and its derivative in terms of growth parameters.
        Results are written as tab-separated files whose paths are built by
        assembleFullName from paths_dict['dir'], paths_dict['filename'] and the
        suffixes 'params', 'diauxie' and 'output'.

    Reads from self:
        model: its .model attribute is handed to GrowthModel
        factor_dict (dictionary): mapping of unique values of variables to numerical
            integers; used to turn codes back into labels before saving
        target (list): target[0] is forwarded to prettyifyParameterReport
        args (dictionary): 'slf' (whether curve.sample().posterior is stored and the
            report reformatted accordingly), 'sgd' (whether x_full is also saved)
        paths_dict (dictionary): 'dir' (path to directory) and 'filename' (file name)
        x_full (pandas.DataFrame): must include columns mu, Sigma and Noise, which
            are dropped before the remaining columns are passed to GrowthModel
        x_min (pandas.DataFrame): one row per unique set of conditions

    Returns:
        None

    Action:
        always writes the 'params' file; writes the 'diauxie' file only if it has at
        least one row; writes the 'output' file only if args['sgd'] is truthy
    '''

    model = self.model
    factor_dict = self.factor_dict
    variable = self.target[0]
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95
    posterior = self.args['slf']
    save_latent = self.args['sgd']

    dir_path = self.paths_dict['dir']
    file_name = self.paths_dict['filename']

    x_full = self.x_full
    x_min = self.x_min

    # Keep only the model inputs. `axis` must be given by keyword (here through
    # `columns=`): pandas 2.0 no longer accepts it positionally. The result does
    # not depend on the loop variable, so it is computed once.
    x_inputs = x_full.drop(columns=['mu', 'Sigma', 'Noise'])

    diauxie_dict = {}
    params_latent = initParamDf(x_min.index, complexity=0)
    params_sample = initParamDf(x_min.index, complexity=1)

    for idx, row in x_min.iterrows():

        # rows of the prediction grid that match this set of conditions
        df = subsetDf(x_inputs, row.to_dict())

        # get curve based on model predictions
        curve = GrowthModel(model=model.model, x_new=df.values, ARD=True).run()

        # 'df_dx' is removed from the parameters and summarised separately
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    gp_params = x_min.join(gp_params)
    gp_params.index.name = 'Sample_ID'
    gp_params = gp_params.reset_index(drop=False)
    gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

    # replace integer codes with the original labels; whole-column assignment is
    # used because the labels generally have a different dtype than the codes
    x_out = x_full.copy()
    for key, mapping in factor_dict.items():
        code_to_label = reverseDict(mapping)
        if key in x_out.columns:
            x_out[key] = x_out[key].replace(code_to_label)
        if key in gp_params.columns:
            gp_params[key] = gp_params[key].replace(code_to_label)

    # split the merged table into a growth parameter report and a diauxie report
    diauxie = initDiauxieList()
    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(gp_params.columns)))

    df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
    df_params = minimizeParameterReport(df_params)

    df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    if posterior:
        df_params = prettyifyParameterReport(df_params, variable, confidence)
        df_params = articulateParameters(df_params, axis=0)

    summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
    diux_path = assembleFullName(dir_path, '', file_name, 'diauxie', '.txt')

    # the index is only written for the posterior report
    df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    if save_latent:
        file_path = assembleFullName(dir_path, '', file_name, 'output', '.txt')
        x_out.to_csv(file_path, sep='\t', header=True, index=True)
# predict_y2 # run import warnings import numpy as np import pandas as pd from GPy.models import GPRegression from scipy.ndimage import filters from libs.kernel import buildKernel, addFixedKernel from libs.curve import GrowthCurve from libs.utils import uniqueRandomString, subsetDf, getValue if getValue('Ignore_RuntimeWarning'): warnings.filterwarnings("ignore", category=RuntimeWarning) def describeVariance(df, time='X0', od='Y'): ''' df columns ['X0','X1',...,'Y'] values of Xs except fo X0 should be non-unique ''' window = getValue('variance_smoothing_window') df = df.sort_values('Time') df.reset_index(drop=True, inplace=True) nX = len(df[time].drop_duplicates())