Example #1
def savePlots(plate, args, directory, filename):
    '''
    Saves the GP model fits of a growth.GrowthPlate() object as a plot.

    Args:
        plate (growth.GrowthPlate() object)
        args (dictionary): keys are arguments and values are user/default choices
        directory (dictionary): keys are folder names, values are their paths
        filename (str): file name

    Returns:
        None
    '''

    if args['plot']:  # plot OD and its GP estimate

        fig_path = assembleFullName(directory['figures'], '', filename, 'fit',
                                    '.pdf')
        plate.plot(fig_path, plot_fit=True, plot_derivative=False)

    if args['pd']:  # plot GP estimate of dOD/dt (i.e. derivative)

        fig_path = assembleFullName(directory['figures'], '', filename,
                                    'derivative', '.pdf')
        plate.plot(fig_path, plot_fit=False, plot_derivative=True)
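
# Usage sketch (values hypothetical, not from the original source): with both
# flags enabled, two PDFs are written to the figures folder, one for the GP
# fit and one for its derivative.
#
#   savePlots(plate, args={'plot': True, 'pd': True},
#             directory={'figures': '/path/to/figures'}, filename='PM1-1')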
Example #2
    def exportReport(self):
        def oneLineReport(**kwargs):
            return pd.DataFrame(columns=[0],
                                index=list(kwargs.keys()),
                                data=list(kwargs.values())).T

        if self.args['sc']:
            sc_msg = 'Samples were normalized to their respective control samples before analysis.'
        else:
            sc_msg = 'Samples were modeled without controlling for batch effects '
            sc_msg += '(i.e. subtracting the growth of group/batch-specific control samples).'

        nthin = len(np.unique(self.model.x[:, 0]))

        msg = 'The following criteria were used to subset data:\n'
        msg += tidyDictPrint(self.params['subset'])
        msg += '\n'
        msg += self.msg
        msg += '\nData Manipulation: Input was reduced to '
        msg += '{} equidistant time points. {}'.format(nthin, sc_msg)
        self.msg = msg

        # compact report of results
        report_args = {
            'Filename': self.paths_dict['filename'],
            'Subtract_Control': self.args['sc'],
            'Subset': self.params['subset'],
            'Hypothesis': self.params['hypo'],
            'LL0': self.LL0,
            'LL1': self.LL1,
            'Log_BF': self.log_BF,
            'FDR': self.args['fdr'],
            'M1_FDR_cutoff': self.M1_Pct_Cutoff,
            'M0_FDR_cutoff': self.M0_Pct_Cutoff,
            'Permuted_log_BF': self.log_BF_null_dist,
            'Func_Diff_Mean': self.delta_od_sum_mean,
            'Func_Diff_CI': self.delta_od_sum_ci
        }

        dir_path = self.paths_dict['dir']
        file_name = self.paths_dict['filename']

        file_path = assembleFullName(dir_path, '', file_name, 'log', '.txt')
        self.compactReport = oneLineReport(**report_args)
        self.compactReport.to_csv(file_path, sep='\t', header=True, index=None)

        # save report of data
        file_path = assembleFullName(dir_path, '', file_name, 'report', '.txt')
        with open(file_path, 'w') as fid:
            fid.write(self.msg)
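
        # Sketch (names hypothetical) of the one-row report pattern used by
        # oneLineReport above: a key -> value mapping becomes a single-column
        # frame whose transpose is one row with one column per field, e.g.
        #   pd.DataFrame(columns=[0], index=['Filename', 'Log_BF'],
        #                data=['exp1', 2.3]).T
        # yields a 1 x 2 DataFrame with columns 'Filename' and 'Log_BF'.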
Example #3
def get_color_legend(df,full_df,args,directory,axis='y'):

    # e.g. color by ribotype
    colorby = getattr(args, 'color_' + axis + '_by')
    if colorby is None: return None

    # variable on axis
    variable = getattr(args, axis + '_variable')

    # e.g. file with two columns: ribotype and color
    colorfile = getattr(args, 'color_file_' + axis)

    # dictionary arguments passed by user
    colorscheme = checkParameterCommand(getattr(args, 'color_scheme_' + axis))

    # can't pass both file and command-line argument
    if colorfile is not None and colorscheme is not None:
        msg = 'WARNING: User must pass either a color file or a color scheme '
        msg += 'for the {}-axis, not both.'.format(axis)
        sys.exit(msg)

    if colorfile is not None:
        colors_df = pd.read_csv(colorfile, sep='\t', header=0, index_col=0)
    if colorscheme is not None:
        colors_df = pd.DataFrame(colorscheme, index=['Color']).T

    # create list of colors based on meta-data
    if args.missing_color is None:
        missing_color = generate_missing_color(list(colors_df.Color.values))
    else:
        missing_color = args.missing_color

    if colorby == variable:
        foo = full_df.loc[:, [variable]]
    else:
        foo = full_df.loc[:, [colorby, variable]]

    foo = foo.drop_duplicates().set_index(variable).astype(str)
    foo = foo.join(colors_df, on=colorby)
    foo.Color = [missing_color if str(ii) == 'nan' else ii for ii in list(foo.Color.values)]
    if axis == 'x': colors = foo.loc[df.columns, 'Color'].values
    if axis == 'y': colors = foo.loc[df.index.values, 'Color'].values

    # create legend patches
    colors_df.loc['~other~', 'Color'] = missing_color
    colors_df = colors_df[colors_df.Color.isin(colors)]
    colors_df = colors_df.to_dict()['Color']
    patches = [mpatches.Patch(color=color, label=label) for label, color in colors_df.items()]

    # plot legend
    fig, ax = plt.subplots(figsize=[4, 4])
    ax.axis(False)
    lgd = ax.legend(handles=patches, loc='center')

    # save legend
    fpath = assembleFullName(directory, '', args.output, axis + '_legend', '.pdf')
    plt.savefig(fpath, bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close()

    return colors
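
# Assumed layout of the color file read above (tab-separated; the first
# column is the index that matches the color-by variable, and the color
# column must be named 'Color'; values illustrative):
#
#   Ribotype        Color
#   RT027           #1f77b4
#   RT078           #ff7f0e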
Example #4
def saveDf(full_df,sub_df,args,directory):

	if not args.save_filtered_table:
		return None

	sub_df = subsetDf(full_df,{args.y_variable:list(sub_df.index.values),
		                       args.x_variable:list(sub_df.keys().values)})

	fpath = assembleFullName(directory,'',args.output,'filtered','.txt')
	sub_df.to_csv(fpath,sep='\t',header=True,index=True)
Example #5
def mergeSummaryData(args, directory, ls_temp_files, ls_summ_files,
                     ls_diux_files, filename):
    '''
    Reads files passed via list and concatenates them into a single pandas.DataFrame. This
        is executed for summary files and gp_data files separately.

    Args:
        args (dictionary): keys are arguments and values are user/default choices
        directory (dictionary): keys are folder names, values are their paths
        ls_temp_files (list): where each item is a file path (str) to a gp_data file
        ls_summ_files (list): where each item is a file path (str) to a summary file
        ls_diux_files (list): where each item is a file path (str) to a diauxie file
        filename (str): base file name
    '''

    if args['sgd'] and args['merges']:

        file_path = assembleFullName(directory['derived'], '', filename,
                                     'gp_data', '.txt')
        concatFileDfs(ls_temp_files).to_csv(file_path,
                                            sep='\t',
                                            header=True,
                                            index=True)

    if args['merges']:

        summ_df = concatFileDfs(ls_summ_files)
        diux_df = concatFileDfs(ls_diux_files)

        summ_path = assembleFullName(directory['summary'], '', filename,
                                     'summary', '.txt')
        diux_path = assembleFullName(directory['summary'], '', filename,
                                     'diauxie', '.txt')

        summ_df.to_csv(summ_path, sep='\t', header=True, index=True)
        if diux_df.shape[0] > 0:
            diux_df.to_csv(diux_path, sep='\t', header=True, index=True)

        # clean-up
        for f in ls_temp_files + ls_summ_files + ls_diux_files:
            if os.path.isfile(f):
                os.remove(f)
Example #6
def clusterMap(df, args, directory):
    def dekwarg(ii):
        key = ii.split(':')[0]
        value = ii.split(':')[1]
        if value.replace('.', '', 1).isdigit():
            value = float(value)
        elif value in ['True', 'False']:
            value = (value == 'True')  # note: bool('False') would be True
        return key, value

    ny, nx = df.shape
    figsize = [nx * 2 + 6, ny * 0.5 + 3]

    kwargs = {'row_cluster': False, 'col_cluster': False, 'figsize': figsize}

    if args['kwargs']:
        h_kwargs = args['kwargs'].split(';')
        h_kwargs = [dekwarg(ii) for ii in h_kwargs]
        h_kwargs = {k: v for k, v in h_kwargs}
        kwargs.update(h_kwargs)

    c = sns.clustermap(df, **kwargs)

    kwargs = {
        'xlabel': '',
        'ylabel': '',
        'title': args['v'] if args['t'] is None else args['t']
    }

    c.ax_heatmap.set(**kwargs)
    c.ax_row_dendrogram.set_visible(False)

    dendro_box = c.ax_row_dendrogram.get_position()
    dendro_box.x0 = (dendro_box.x0 + 2 * dendro_box.x1) / 3
    dendro_box.x0 = dendro_box.x0 - 0.01
    dendro_box.x1 = dendro_box.x1 - 0.01
    c.cax.set_position(dendro_box)
    c.cax.yaxis.set_ticks_position("left")

    for label in c.ax_heatmap.get_xticklabels() + c.ax_heatmap.get_yticklabels():
        label.set(fontsize=30)
    for label in c.ax_heatmap.get_xticklabels():
        label.set(rotation=90)

    fpath = assembleFullName(directory, '', args['fo'], '', '.pdf')
    plt.savefig(fpath, bbox_inches='tight')
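
# Sketch of the assumed ';'-separated kwargs format forwarded to
# sns.clustermap() (values hypothetical):
#
#   args['kwargs'] = 'vmin:0;vmax:2.5;annot:True'
#
# parses via dekwarg() into {'vmin': 0.0, 'vmax': 2.5, 'annot': True}; any
# token that is neither numeric nor 'True'/'False' stays a string.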
Example #7
    def initPaths(self):
        '''
        Initialize paths for saving data and results.
        '''

        # if user did not pass file name for output, use time stamp
        file_name = selectFileName(self.args['fout'])
        dir_path = assemblePath(self.directory['models'], file_name, '')
        if not os.path.exists(dir_path): os.mkdir(dir_path)

        # running model on transformed results and recording results
        file_path_key = assembleFullName(dir_path, '', file_name, 'key',
                                         '.txt')
        file_path_input = assembleFullName(dir_path, '', file_name, 'input',
                                           '.txt')

        paths_dict = {}

        paths_dict['filename'] = file_name
        paths_dict['dir'] = dir_path
        paths_dict['key'] = file_path_key
        paths_dict['input'] = file_path_input

        self.paths_dict = paths_dict
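
        # Illustration (layout assumed, not from the original source): for a
        # run named 'exp1', initPaths() creates a models/exp1/ directory and
        # records paths for a key file and an input file inside it, keyed as
        # 'key' and 'input' in paths_dict.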
Example #8
def plot(df, args, directory):
    def dekwarg(ii):
        key = ii.split(':')[0]
        value = ii.split(':')[1]
        if value.replace('.', '', 1).isdigit():
            value = float(value)
        elif value in ['True', 'False']:
            value = (value == 'True')  # note: bool('False') would be True
        return key, value

    ny, nx = df.shape
    figsize = [nx * 2, ny * 0.5]

    kwargs = {}

    if args['kwargs']:
        h_kwargs = args['kwargs'].split(';')
        h_kwargs = [dekwarg(ii) for ii in h_kwargs]
        h_kwargs = {k: v for k, v in h_kwargs}

        kwargs.update(h_kwargs)

    fig, ax = plt.subplots(figsize=figsize)

    sns.heatmap(df, ax=ax, **kwargs)

    kwargs = {
        'xlabel': '',
        'ylabel': '',
        'title': args['v'] if args['t'] is None else args['t']
    }

    ax.set(**kwargs)

    for label in ax.get_xticklabels():
        label.set(rotation=90)

    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=20)

    fpath = assembleFullName(directory, '', args['fo'], '', '.pdf')
    plt.savefig(fpath, bbox_inches='tight')
Example #9
def basicSummaryOnly(data, mapping, directory, args, verbose=False):
    '''
    If the user only requested plotting, then for each data file, perform a basic algebraic
        summary and plot the data. Once completed, exit the system. Otherwise, return None.

    Args:
        data (dictionary): keys are plate IDs and values are pandas.DataFrames with size t x (n+1)
            where t is the number of time-points and n is the number of wells (i.e. samples);
            the additional 1 is due to the explicit 'Time' column, index is uninformative.
        mapping (dictionary): keys are plate IDs and values are pandas.DataFrames with size n x (p)
            where n is the number of wells (or samples) in the plate, and p is the number of
            variables or parameters described in the dataframe.
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Returns:
        None: if the only_basic_summary argument is False.
    '''

    if not args['obs']:  # if not only_basic_summary
        return None

    print(tidyMessage('AMiGA is summarizing and plotting data files'))

    list_keys = []

    for pid, data_df in data.items():

        # define paths where summary and plot will be saved
        key_file_path = assemblePath(directory['summary'], pid, '.txt')
        key_fig_path = assemblePath(directory['figures'], pid, '.pdf')

        # grab plate-specific samples
        #   index should be well IDs, but a column 'Well' should also exist;
        #   in main.py, annotateMappings() is called, which ensures the above is the case
        mapping_df = mapping[pid]
        mapping_df = resetNameIndex(mapping_df, 'Well', False)

        # grab plate-specific data
        wells = list(mapping_df.Well.values)
        data_df = data_df.loc[:, ['Time'] + wells]

        # update plate-specific data with unique Sample Identifiers
        sample_ids = list(mapping_df.index.values)
        data_df.columns = ['Time'] + sample_ids

        # create GrowthPlate object, perform basic summary
        plate = GrowthPlate(data=data_df, key=mapping_df)
        plate.convertTimeUnits(input=getTimeUnits('input'),
                               output=getTimeUnits('output'))
        plate.computeBasicSummary()
        plate.computeFoldChange(subtract_baseline=True)

        # plot and save as PDF, also save key as TXT
        if not args['dp']:
            plate.plot(key_fig_path)

        if args['merges']:
            list_keys.append(plate.key)
        else:
            plate.key.to_csv(key_file_path, sep='\t', header=True, index=False)

        smartPrint(pid, verbose=verbose)

    if args['merges']:
        filename = selectFileName(args['fout'])
        summary_path = assembleFullName(directory['summary'], 'summary',
                                        filename, '_basic', '.txt')
        summary_df = pd.concat(list_keys, sort=False)
        summary_df.to_csv(summary_path, sep='\t', header=True, index=False)

    smartPrint(
        '\nSee {} for summary text file(s).'.format(directory['summary']),
        verbose)
    smartPrint('See {} for figure PDF(s).\n'.format(directory['figures']),
               verbose)

    msg = 'AMiGA completed your request and '
    msg += 'wishes you good luck with the analysis!'
    print(tidyMessage(msg))

    sys.exit()
Example #10
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.
        While runGrowthFitting() analyzes data one plate at a time, runCombinedGrowthFitting()
        can pool experimental replicates across different plates. The downside is that data
        summary must be merged and no 96-well plate grid figure can be produced.  

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(
            missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True,
                           poly=getValue('PolyFit'),
                           groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(combine_keys).reset_index(drop=True)

    smartPrint(
        'AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
        verbose)

    data_ls, diauxie_dict = [], {}

    # get user-defined values from config.py
    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')
    posterior_n = getValue('n_posterior_samples')
    scale = getValue('params_scale')

    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substrate': ['D-Trehalose'], 'PM': [1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # index values for the N samples
        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1

        cond_data = cond_data.melt(id_vars='Time',
                                   var_name='Sample_ID',
                                   value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (R = number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data,
                         ARD=True,
                         heteroscedastic=fix_noise,
                         nthin=nthin)

        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior: params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise: sigma_noise = np.ravel(gm.error_new) + gm.noise
            else: sigma_noise = np.ravel([gm.noise] * time.shape[0])

            mu_var = pd.DataFrame(
                [mu0, var0, mu1, var1, sigma_noise],
                index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior: gp_params = params_sample.join(params_latent['diauxie'])
    else: gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary',
                                 '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie',
                                 '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename,
                                     'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
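
# Illustration (hypothetical data, not part of the original function): the
# melt step above reshapes replicate columns of a T x (N+1) frame into the
# long (Time, OD) format that GrowthModel consumes.
import pandas as pd

wide = pd.DataFrame({'Time': [0, 1], 'W1': [0.10, 0.21], 'W2': [0.11, 0.19]})
long = wide.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
long = long.drop(['Sample_ID'], axis=1).dropna()  # T*R x 2, here 4 rows of Time, OD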
Example #11
def runGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth kinetics.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables plus-one (p+1)
            plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    if args['pool']:
        runCombinedGrowthFitting(data,
                                 mapping,
                                 directory,
                                 args,
                                 verbose=verbose)
        return None

    # only store data if user requested its writing or requested plotting
    if args['sgd'] or args['plot'] or args['pd']: store = True
    else: store = False

    # if user requested merging of summary/data, store each plate's data/summary in temp directory first
    tmpdir = tempfile.mkdtemp()
    saved_umask = os.umask(0o77)  # files can only be read/written by creator, for security
    print('Temporary directory is {}\n'.format(tmpdir))

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=True)

    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')

    ls_temp_files = []
    ls_summ_files = []
    ls_diux_files = []

    # for each plate, get samples and save individual text file for plate-specific summaries
    for pid in plate.key.Plate_ID.unique():

        smartPrint('Fitting {}'.format(pid), verbose)

        # grab plate-specific summary
        sub_plate = plate.extractGrowthData(args_dict={'Plate_ID': pid})

        # the primary motivation of this function: run gp model
        sub_plate.model(nthin=args['nthin'], store=store, verbose=verbose)

        # normalize parameters, if requested
        sub_plate.key = normalizeParameters(args, sub_plate.key)

        # save plots, if requested by user
        savePlots(sub_plate, args, directory, pid)

        # define file paths where data will be written
        if args['merges']:
            temp_path = assembleFullName(tmpdir, '', pid, 'gp_data', '.txt')
            summ_path = assembleFullName(tmpdir, '', pid, 'summary', '.txt')
            diux_path = assembleFullName(tmpdir, '', pid, 'diauxie', '.txt')
        else:
            temp_path = assembleFullName(directory['derived'], '', pid,
                                         'gp_data', '.txt')
            summ_path = assembleFullName(directory['summary'], '', pid,
                                         'summary', '.txt')
            diux_path = assembleFullName(directory['summary'], '', pid,
                                         'diauxie', '.txt')

        # save data, if requested by user
        savePlateData(args['sgd'], sub_plate, temp_path, summ_path, diux_path)

        # track all potentially created files
        ls_temp_files.append(temp_path)
        ls_summ_files.append(summ_path)
        ls_diux_files.append(diux_path)

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # if user requested merging, merge all files in temporary directory
    mergeSummaryData(args, directory, ls_temp_files, ls_summ_files,
                     ls_diux_files, filename)

    # remove temporary directory
    os.umask(saved_umask)
    os.rmdir(tmpdir)

    return None
Example #12
def clusterMap(df,full_df,args,directory):

	def dekwarg(ii):
		'''Infers the types of user-defined arguments'''
		key = ii.split(':')[0]
		value = ii.split(':')[1]
		adj_value = value.replace('-','')
		adj_value = adj_value.replace('.','')
		if adj_value.isdigit():
			value = float(value)
		elif value in ['True','False']:
			value = (value == 'True')
		return key,value

	# define figure size (inches)
	ny,nx = df.shape

	if args.width_height is None:
		figsize=[nx*2+6,ny*0.5+3]
	else:
		w,h = args.width_height
		figsize=[float(w),float(h)]

	# package arguments into a dictionary
	kwargs = {'row_cluster':False,'col_cluster':False,'figsize':figsize}
	if args.kwargs:
		h_kwargs = args.kwargs.split(';')
		h_kwargs = [dekwarg(ii) for ii in h_kwargs]
		h_kwargs = {k:v for k,v in h_kwargs}
		kwargs.update(h_kwargs)

	if kwargs['row_cluster'] or args.cluster_y:
		kwargs['row_cluster'] = True
	if kwargs['col_cluster'] or args.cluster_x:
		kwargs['col_cluster'] = True

	# sort heatmap, if requested
	df,kwargs = sort_heatmap(df,full_df,args,kwargs)

	# get colors for side bars
	row_colors = get_color_legend(df,full_df,args,directory,axis='y')
	col_colors = get_color_legend(df,full_df,args,directory,axis='x')

	# big finale
	c = sns.clustermap(df,**kwargs,dendrogram_ratio=float(args.colorbar_ratio),
		row_colors=row_colors,col_colors=col_colors,
		colors_ratio=(args.color_x_ratio,args.color_y_ratio),
		yticklabels=True,xticklabels=True,annot=False)

	# adjust title
	title = [args.value if args.title is None else args.title][0]
	if args.color_x_by is not None:
		pad = 40
	else:
		pad = 15
	c.ax_heatmap.set_title(title,pad=pad)

	# adjust labels
	kwargs = {'xlabel':'','ylabel':''}
	c.ax_heatmap.set(**kwargs)
	c.ax_row_dendrogram.set_visible(False)
	c.ax_col_dendrogram.set_visible(False)

	# adjust color bar position and dimensions
	dendro_box = c.ax_row_dendrogram.get_position()
	dendro_box.x0 = (dendro_box.x0 + 2 * dendro_box.x1) /3
	dendro_box.x0 = dendro_box.x0 - 0.01
	dendro_box.x1 = dendro_box.x1 - 0.01
	c.cax.set_position(dendro_box)
	c.cax.yaxis.set_ticks_position("left")

	# adjust size and rotation of tick labels
	dyanmically_size_font(c.ax_heatmap,c.cax,args,df)

	# check for proper rendering of tick labels
	msg = 'WARNING: figure size is too small and/or fontsize is too large '
	msg += 'to display all {}-axis labels. Please increase the {} '
	msg += 'argument and/or decrease the fontsize to ensure that all labels '
	msg += 'are properly printed.'

	yticklabels = c.ax_heatmap.get_yticklabels()
	xticklabels = c.ax_heatmap.get_xticklabels()

	if df.shape[0] != len(yticklabels): print(msg.format('y','height'))
	if df.shape[1] != len(xticklabels): print(msg.format('x','width'))

	# highlight feature
	highlight_labels = checkParameterCommand(args.highlight_labels)

	if highlight_labels is not None:
		for key,values in highlight_labels.items():
			if key == 'x': tlabels = xticklabels
			elif key == 'y': tlabels = yticklabels
			for label in values:
				matches = np.where([ii.get_text()==label for ii in tlabels])[0]
				for match in matches: tlabels[match].set(color='red',fontweight='bold')
  

	fpath = assembleFullName(directory,'',args.output,'','.pdf')
	plt.savefig(fpath,bbox_inches='tight')
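
# The highlight_labels argument is assumed (from the loop above) to parse
# into a dict keyed by axis with lists of tick labels as values, e.g.
#   {'x': ['D-Trehalose'], 'y': ['RT027']}
# matching tick labels are then drawn in bold red on that axis.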
Example #13
    def computeFullDifference(self):
        '''
        Computes the full difference between two latent functions (modeling growth curves).

        Args:
            x_diff (pandas.DataFrame): must include columns of Time, mu (mean of latent 
                function), Sigma (diagonal covariance of latent function)
            variable (str): variable of interest, must be a column name in x_diff
            confidence (float [0.0,1.0]): confidence interval, e.g. 0.95 for 95%.
            n (int): number of samples from posterior distribution
            posterior (boolean): whether to sample from the posterior distribution
            noise (boolean): whether to plot 95-pct credible intervals including sample uncertainty

        Returns:
            df (pandas.DataFrame)
            delta_od_sum (float): ||delta OD(t)||, the square root of the sum of squared
                differences between the two latent functions (computed below as dos).
        '''

        x_diff = self.x_full
        variable = self.target[0]
        confidence = getValue('confidence')  # confidence interval, e.g. 0.95
        confidence = 1 - (1 - confidence) / 2
        noise = self.args['noise']
        posterior_n = getValue('n_posterior_samples')
        save_latent = self.args['sgd']
        factor_dict = self.factor_dict

        def buildTestMatrix(x_time):
            '''
            Build a test matrix to simplify the OD full difference computation.
                See https://github.com/ptonner/gp_growth_phenotype/testStatistic.py
                This is used to compare two growth latent functions. The differences between
                the first time points (measurements) are adjusted to zero.
            Args:
                x_time (pandas.DataFrame or pandas.Series or numpy.ndarray), ndim > 1
            Returns:
                A (numpy.ndarray): N-1 x 2*N where N is length of time.
            '''

            n = x_time.shape[0]
            A = np.zeros((n - 1, 2 * n))
            A[:, 0] = 1
            A[range(n - 1), range(1, n)] = -1
            A[:, n] = -1
            A[range(n - 1), n + np.arange(1, n)] = 1

            return A
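
        # Illustration (not part of the original): for n = 3 time points,
        # buildTestMatrix returns the 2 x 6 matrix
        #   [[1, -1,  0, -1,  1,  0],
        #    [1,  0, -1, -1,  0,  1]]
        # so row t of A.dot(mu) equals (y1(t) - y1(0)) - (y0(t) - y0(0)),
        # i.e. the difference between the two curves after each is zeroed
        # at its first time point.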

        x_diff = x_diff.sort_values([variable, 'Time'])  # do you really need to sort by variable?
        x_time = x_diff.Time.drop_duplicates()

        # define mean and covariance of data
        mu = x_diff['mu'].values
        if noise: Sigma = np.diag(x_diff['Sigma'] + x_diff['Noise'])
        else: Sigma = np.diag(x_diff['Sigma'])

        # define mean and covariance of functional difference
        A = buildTestMatrix(x_time)
        m = np.dot(A, mu)
        c = np.dot(A, np.dot(Sigma, A.T))
        mean, std = m, np.sqrt(np.diag(c))

        # sample the curve for the difference between functions, from an MVN distribution
        samples = np.random.multivariate_normal(m, c, posterior_n)

        # compute the sum of functional differences for all sampled curves
        dos = [np.sqrt(np.sum([ii**2 for ii in s])) for s in samples]
        dos_mu, dos_std = np.mean(dos), np.std(dos)
        dos_actual = np.sqrt(np.sum([ii**2 for ii in m]))

        # compute the confidence interval for the sum of functional differences
        scaler = norm.ppf(confidence)  # confidence interval scaler for MVN predictions
        ci = (dos_mu - scaler * dos_std, dos_mu + scaler * dos_std)

        # compute credible intervals for the curve of the difference
        y_avg = mean
        y_low = y_avg - scaler * std
        y_upp = y_avg + scaler * std

        # package results
        t = x_time[1:].values
        df = pd.DataFrame([t, y_avg, y_low, y_upp],
                          index=['Time', 'Avg', 'Low', 'Upp']).T

        self.functional_diff = df
        self.delta_od_sum_mean = dos_mu
        self.delta_od_sum_ci = ci

        # save gp_data fit
        dir_path = self.paths_dict['dir']
        file_name = self.paths_dict['filename']
        if save_latent:
            file_path = assembleFullName(dir_path, '', file_name, 'func_diff',
                                         '.txt')
            df.to_csv(file_path, sep='\t', header=True, index=True)
Example #14
    def savePredictions(self):
        '''
        Given model predictions of growth curves (for each unique set of conditions tested),
            describe the latent function and its derivative in terms of growth parameters. 
            Reports results in a file with {file_name}_params name in dir_path directory. 

        Args:
            model (GPy.models.gp_regression.GPRegression)
            data (pandas.DataFrame)
            hypothesis (dictionary): e.g. {'H0':['Time'],'H1':['Time','Substrate']}
            factor_dict (dictionary): mapping of unique values of variables to numerical integers
            posterior (boolean)
            save_latent (boolean)
            dir_path (str): path to directory
            file_name (str): file name

        Returns:
            None: results are written to the params, diauxie, and (optionally) output
                files in the dir_path directory.
        '''

        data = self.data
        model = self.model
        hypothesis = self.hypothesis
        factor_dict = self.factor_dict
        variable = self.target[0]
        confidence = getValue('confidence')  # confidence interval, e.g. 0.95

        posterior = self.args['slf']
        save_latent = self.args['sgd']
        fix_noise = self.args['fn']

        dir_path = self.paths_dict['dir']
        file_name = self.paths_dict['filename']

        # define hypothesis parameters
        model_input = hypothesis['H1']  # grab minimal input data for prediction
        x_full = self.x_full
        x_min = self.x_min

        diauxie_dict = {}
        params_latent = initParamDf(x_min.index, complexity=0)
        params_sample = initParamDf(x_min.index, complexity=1)

        for idx, row in x_min.iterrows():

            # get x and y data
            df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1),
                          row.to_dict())

            # get curve based on model predictions
            gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
            curve = gm.run()

            # get parameter estimates using predicted curve
            diauxie_dict[idx] = curve.params.pop('df_dx')
            params_latent.loc[idx, :] = curve.params

            if posterior: params_sample.loc[idx, :] = curve.sample().posterior

        # summarize diauxie results
        diauxie_df = mergeDiauxieDfs(diauxie_dict)

        if posterior: gp_params = params_sample.join(params_latent['diauxie'])
        else: gp_params = params_latent

        gp_params = x_min.join(gp_params)
        gp_params.index.name = 'Sample_ID'
        gp_params = gp_params.reset_index(drop=False)
        gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

        # map numerically-encoded factor levels back to their original labels
        x_out = x_full.copy()
        for key, mapping in factor_dict.items():
            if key in x_out.keys():
                x_out.loc[:, key] = x_out.loc[:, key].replace(reverseDict(mapping))
            if key in gp_params.keys():
                gp_params.loc[:, key] = gp_params.loc[:, key].replace(reverseDict(mapping))

        diauxie = initDiauxieList()
        params = initParamList(0) + initParamList(1)
        params = list(set(params).intersection(set(gp_params.keys())))

        df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
        df_params = minimizeParameterReport(df_params)
        df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
        df_diauxie = minimizeDiauxieReport(df_diauxie)

        if posterior:
            df_params = prettyifyParameterReport(df_params, variable,
                                                 confidence)
            df_params = articulateParameters(df_params, axis=0)

        summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
        diux_path = assembleFullName(dir_path, '', file_name, 'diauxie',
                                     '.txt')

        df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
        if df_diauxie.shape[0] > 0:
            df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

        if save_latent:
            file_path = assembleFullName(dir_path, '', file_name, 'output',
                                         '.txt')
            x_out.to_csv(file_path, sep='\t', header=True, index=True)
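
        # Sketch (mapping hypothetical): with
        #   factor_dict = {'Substrate': {'Glucose': 0, 'Maltose': 1}}
        # reverseDict() presumably yields {0: 'Glucose', 1: 'Maltose'}, so the
        # decode step above restores readable labels before files are written.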
Example #15
def assembleMappings(data,
                     mapping_path,
                     meta_path=None,
                     save=False,
                     verbose=False):
    '''
    Creates a master mapping dictionary for all data files in the input argument.
        For each data file, in this particular order, it will first (1) check if an individual
        mapping file exists, (2) if not, check if relevant meta-data is provided in the meta.txt
        file, (3) if not, infer whether the plate is a BIOLOG PM based on its file name, and
        (4) if all fail, create a minimalist mapping file.

    Args:
        data (dictionary): keys are file names (i.e. filebases or Plate IDs) and values are
            pandas DataFrames where the index column (row names) are well IDs.
        mapping_path (str): path to the mapping folder.
        meta_path (str): path to the meta.txt file.
        save (boolean): whether to save each derived mapping file to disk.
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): keys are file names and values are mapping DataFrames.
    '''

    df_mapping_dict = {}

    # list all data files to be analyzed
    list_filebases = data.keys()

    # list all potential mapping file paths
    list_mapping_files = [
        assemblePath(mapping_path, ii, '.txt') for ii in list_filebases
    ]

    # read meta.txt and list all plates described by it
    meta_df, meta_df_plates = checkMetaText(meta_path, verbose=verbose)

    # assemble mapping for one data file at a time
    for filebase, mapping_file_path in zip(list_filebases, list_mapping_files):

        # what are the row names from the original data file
        well_ids = data[filebase].columns[1:]  # may not be A1 ... H12, but most often will be

        # create file path for saving derived mapping, if requested
        newfilepath = assembleFullName(mapping_path, '', filebase, '', '.map')

        # see if user provided a mapping file that corresponds to this data file (filebase)
        if os.path.exists(mapping_file_path):

            df_mapping = pd.read_csv(mapping_file_path,
                                     sep='\t',
                                     header=0,
                                     index_col=0,
                                     dtype={
                                         'Plate_ID': str,
                                         'Isolate': str
                                     })
            # make sure Plate_ID is a column
            df_mapping = checkPlateIdColumn(df_mapping, filebase)
            # strip leading zeros in well names, e.g. A01 -> A1
            df_mapping.index = [ii[0] + ii[1:].lstrip('0') for ii in df_mapping.index]

            smartPrint('{:.<30} Reading {}.'.format(filebase,
                                                    mapping_file_path),
                       verbose=verbose)

        # see if user described the file in meta.txt
        elif filebase in meta_df_plates:

            meta_info = meta_df[meta_df.Plate_ID == filebase]
            msg = '{:.<30} Found meta-data in meta.txt '.format(filebase)

            biolog = isBiologFromMeta(
                meta_info)  # does meta_df indicate this is a BIOLOG plate

            if biolog:
                checkBiologSize(data[filebase], filebase)
                df_mapping = expandBiologMetaData(meta_info)
                msg += '& seems to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)
            else:
                df_mapping = initKeyFromMeta(meta_info, well_ids)
                msg += '& does not seem to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)

        elif isBiologFromName(filebase):
            checkBiologSize(data[filebase], filebase)
            df_mapping = initBiologPlateKey(filebase)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(
                filebase)
            msg += 'BUT seems to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        else:
            df_mapping = initMappingDf(filebase, well_ids)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(
                filebase)
            msg += '& does not seem to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        df_mapping_dict[filebase] = expandMappingParams(df_mapping,
                                                        verbose=verbose)

        if save:
            df_mapping_dict[filebase].to_csv(newfilepath,
                                             sep='\t',
                                             header=True,
                                             index=True)

    smartPrint('', verbose=verbose)

    return df_mapping_dict
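
# Assumed layout of a per-plate mapping file read above (tab-separated, with
# well IDs as the index column; column names illustrative except Plate_ID and
# Isolate, which the reader explicitly types as str):
#
#   Well    Plate_ID    Isolate    Substrate
#   A1      PM1-1       CD2015     Negative Control
#   A2      PM1-1       CD2015     L-Arabinose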