def initPaths(self):
    '''
    Initialize paths for saving data and results.
    '''

    # if user did not pass file name for output, use time stamp
    file_name = selectFileName(self.args['fout'])
    dir_path = assemblePath(self.directory['models'], file_name, '')
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    # running model on transformed results and recording results
    file_path_key = assembleFullName(dir_path, '', file_name, 'key', '.txt')
    file_path_input = assembleFullName(dir_path, '', file_name, 'input', '.txt')

    paths_dict = {}
    paths_dict['filename'] = file_name
    paths_dict['dir'] = dir_path
    paths_dict['key'] = file_path_key
    paths_dict['input'] = file_path_input

    self.paths_dict = paths_dict
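
# Illustration only (not part of AMiGA): a plain-Python sketch of the layout
# that initPaths() assembles, assuming the assemblePath/assembleFullName helpers
# behave roughly like os.path.join with a suffix. The default file name below is
# a hypothetical time stamp.
def _example_model_paths(models_dir='models', file_name='2021_01_01_12_00_00'):
    import os
    dir_path = os.path.join(models_dir, file_name)
    return {
        'filename': file_name,
        'dir': dir_path,
        'key': os.path.join(dir_path, file_name + '_key.txt'),
        'input': os.path.join(dir_path, file_name + '_input.txt'),
    }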
def basicSummaryOnly(data, mapping, directory, args, verbose=False):
    '''
    If the user only requested plotting, perform a basic algebraic summary and
    plot the data for each data file, then exit. Otherwise, return None.

    Args:
        data (dictionary): keys are plate IDs and values are pandas.DataFrames
            with size t x (n+1), where t is the number of time points and n is
            the number of wells (i.e. samples); the additional column is the
            explicit 'Time' column; the index is uninformative.
        mapping (dictionary): keys are plate IDs and values are pandas.DataFrames
            with size n x p, where n is the number of wells (or samples) in the
            plate and p is the number of variables or parameters described in
            the dataframe.
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Returns:
        None: if only_plot_plate argument is False.
    '''

    if not args['obs']:  # if not only_basic_summary
        return None

    print(tidyMessage('AMiGA is summarizing and plotting data files'))

    list_keys = []

    for pid, data_df in data.items():

        # define paths where summary and plot will be saved
        key_file_path = assemblePath(directory['summary'], pid, '.txt')
        key_fig_path = assemblePath(directory['figures'], pid, '.pdf')

        # grab plate-specific samples
        #   index should be well IDs, but a column 'Well' should also exist;
        #   in main.py, annotateMappings() is called, which ensures the above is the case
        mapping_df = mapping[pid]
        mapping_df = resetNameIndex(mapping_df, 'Well', False)

        # grab plate-specific data
        wells = list(mapping_df.Well.values)
        data_df = data_df.loc[:, ['Time'] + wells]

        # update plate-specific data with unique Sample Identifiers
        sample_ids = list(mapping_df.index.values)
        data_df.columns = ['Time'] + sample_ids

        # create GrowthPlate object, perform basic summary
        plate = GrowthPlate(data=data_df, key=mapping_df)
        plate.convertTimeUnits(input=getTimeUnits('input'), output=getTimeUnits('output'))
        plate.computeBasicSummary()
        plate.computeFoldChange(subtract_baseline=True)

        # plot and save as PDF, also save key as TXT
        if not args['dp']:
            plate.plot(key_fig_path)

        if args['merges']:
            list_keys.append(plate.key)
        else:
            plate.key.to_csv(key_file_path, sep='\t', header=True, index=False)

        smartPrint(pid, verbose=verbose)

    if args['merges']:
        filename = selectFileName(args['fout'])
        summary_path = assembleFullName(directory['summary'], 'summary', filename, '_basic', '.txt')
        summary_df = pd.concat(list_keys, sort=False)
        summary_df.to_csv(summary_path, sep='\t', header=True, index=False)

    smartPrint('\nSee {} for summary text file(s).'.format(directory['summary']), verbose)
    smartPrint('See {} for figure PDF(s).\n'.format(directory['figures']), verbose)

    msg = 'AMiGA completed your request and '
    msg += 'wishes you good luck with the analysis!'
    print(tidyMessage(msg))

    sys.exit()
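
# Illustration only (not part of AMiGA): minimal, hypothetical `data` and
# `mapping` dictionaries with the shapes described in the docstring of
# basicSummaryOnly(). Plate ID, wells, and substrate names are made up.
def _example_summary_inputs():
    import pandas as pd
    data = {
        'PM1-A': pd.DataFrame({
            'Time': [0.0, 0.5, 1.0],       # t = 3 time points
            'A1': [0.10, 0.15, 0.30],      # n = 2 wells
            'A2': [0.11, 0.14, 0.28],
        })
    }
    mapping = {
        'PM1-A': pd.DataFrame(
            {'Well': ['A1', 'A2'],
             'Substrate': ['D-Glucose', 'Negative Control']},
            index=['A1', 'A2'])
    }
    return data, mapping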
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth
    kinetics. While runGrowthFitting() analyzes data one plate at a time,
    runCombinedGrowthFitting() can pool experimental replicates across different
    plates. The downside is that data summaries must be merged and no 96-well
    plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True, poly=getValue('PolyFit'), groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']].drop_duplicates(combine_keys).reset_index(drop=True)

    smartPrint('AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]), verbose)

    data_ls, diauxie_dict = [], {}

    # get user-defined values from config.py
    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')
    posterior_n = getValue('n_posterior_samples')
    scale = getValue('params_scale')
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substrate': ['D-Trehalose'], 'PM': [1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # list of index values for N samples

        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1
        cond_data = cond_data.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (where R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data, ARD=True, heteroscedastic=fix_noise, nthin=nthin)
        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                sigma_noise = np.ravel([gm.noise] * time.shape[0])

            mu_var = pd.DataFrame([mu0, var0, mu1, var1, sigma_noise],
                                  index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0), columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary', '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie', '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename, 'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
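
# Illustration only (not part of AMiGA): how replicate curves for one pooled
# condition are reshaped into the long T*R x 2 table handed to the GP model,
# using plain pandas. Column names and values here are hypothetical.
def _example_pooled_condition_table():
    import pandas as pd
    plate_time = pd.DataFrame({'Time': [0.0, 0.5, 1.0]})        # T = 3 time points
    replicates = pd.DataFrame({'S1': [0.10, 0.20, 0.40],        # R = 2 replicate samples
                               'S2': [0.10, 0.22, 0.38]})
    cond_data = (plate_time.join(replicates)
                 .melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
                 .drop(['Sample_ID'], axis=1)
                 .dropna())
    return cond_data                                            # 6 rows (T*R) x 2 columns (Time, OD)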
def runGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    if args['pool']:
        runCombinedGrowthFitting(data, mapping, directory, args, verbose=verbose)
        return None

    # only store data if user requested its writing or requested plotting
    if args['sgd'] or args['plot'] or args['pd']:
        store = True
    else:
        store = False

    # if user requested merging of summary/data, store each plate's data/summary in temp directory first
    tmpdir = tempfile.mkdtemp()
    saved_umask = os.umask(0o77)  # files can only be read/written by creator, for security
    print('Temporary directory is {}\n'.format(tmpdir))

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=True)

    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')

    ls_temp_files = []
    ls_summ_files = []
    ls_diux_files = []

    # for each plate, get samples and save individual text file for plate-specific summaries
    for pid in plate.key.Plate_ID.unique():

        smartPrint('Fitting {}'.format(pid), verbose)

        # grab plate-specific summary
        sub_plate = plate.extractGrowthData(args_dict={'Plate_ID': pid})

        # the primary motivation of this function: run GP model
        sub_plate.model(nthin=args['nthin'], store=store, verbose=verbose)

        # normalize parameters, if requested
        sub_plate.key = normalizeParameters(args, sub_plate.key)

        # save plots, if requested by user
        savePlots(sub_plate, args, directory, pid)

        # define file paths where data will be written
        if args['merges']:
            temp_path = assembleFullName(tmpdir, '', pid, 'gp_data', '.txt')
            summ_path = assembleFullName(tmpdir, '', pid, 'summary', '.txt')
            diux_path = assembleFullName(tmpdir, '', pid, 'diauxie', '.txt')
        else:
            temp_path = assembleFullName(directory['derived'], '', pid, 'gp_data', '.txt')
            summ_path = assembleFullName(directory['summary'], '', pid, 'summary', '.txt')
            diux_path = assembleFullName(directory['summary'], '', pid, 'diauxie', '.txt')

        # save data, if requested by user
        savePlateData(args['sgd'], sub_plate, temp_path, summ_path, diux_path)

        # track all potentially created files
        ls_temp_files.append(temp_path)
        ls_summ_files.append(summ_path)
        ls_diux_files.append(diux_path)

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # if user requested merging, merge all files in temporary directory
    mergeSummaryData(args, directory, ls_temp_files, ls_summ_files, ls_diux_files, filename)

    # remove temporary directory
    os.umask(saved_umask)
    os.rmdir(tmpdir)

    return None
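
# Illustration only (not part of AMiGA): the temporary-directory pattern used
# above when merging is requested. Per-plate files are written under a
# restrictive umask, consumed by a merge step, then cleaned up. The file name
# and contents below are hypothetical.
def _example_temp_merge_dir():
    import os
    import tempfile
    tmpdir = tempfile.mkdtemp()
    saved_umask = os.umask(0o77)   # new files readable/writable by creator only
    try:
        summ_path = os.path.join(tmpdir, 'PM1-A_summary.txt')
        with open(summ_path, 'w') as fh:
            fh.write('Plate_ID\tdiauxie\n')
        # ... a merge step would read summ_path here ...
        os.remove(summ_path)
    finally:
        os.umask(saved_umask)
        os.rmdir(tmpdir)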