def compare_column_across_conditions(column, df):
    """Compare `column` between the 'WT' and 'pin/pin' conditions per sample type.

    For every "sample_type" group, the values of `column` are split by
    "condition", the two groups are compared with `stats.ttest_ind`, and the
    group means plus the p-value are stored in a summary table. The table is
    written to "<column>_comparison.csv", with spaces in `column` replaced by
    underscores.

    Args:
        column: Name of the column in `df` to compare (a string; it is also
            used to build the output file name).
        df: DataFrame to analyse. It is copied first, so the caller's frame
            is left untouched.

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: if a sample type has no 'WT' or no 'pin/pin' rows
            (raised by `get_group`).
    """
    df = df.copy()  # Don't modify what was passed in
    # Presumably adds the "condition" and "sample_type" columns grouped on
    # below -- the helper is defined elsewhere; confirm against its source.
    lim_pirna_common.add_conditions_and_sample_types(df)

    # list() gives pandas a concrete sequence on Python 3, where
    # dict.values() is a view rather than a list.
    comparison_df = pd.DataFrame(index=list(SAMPLE_NAMES.values()),
                                 columns=['WT mean', 'pin/pin mean', 'p-value'])

    for (sample_type, sample_df) in df.groupby("sample_type"):
        condition_gb = sample_df.groupby("condition")[column]
        wt = condition_gb.get_group('WT')
        mut = condition_gb.get_group('pin/pin')
        (_, p) = stats.ttest_ind(wt, mut)

        # Assign straight into the table with .loc: DataFrame.ix no longer
        # exists in current pandas, and writing through an intermediate row
        # object is not guaranteed to update comparison_df itself.
        # NOTE(review): .loc adds a new row when sample_type is not already
        # in the index, where .ix lookup would have raised KeyError.
        comparison_df.loc[sample_type, 'WT mean'] = wt.mean()
        comparison_df.loc[sample_type, 'pin/pin mean'] = mut.mean()
        comparison_df.loc[sample_type, 'p-value'] = p

    comparison_df.to_csv("%s_comparison.csv" % column.replace(' ', '_'))
def plot_sequence_lengths(description, sequence_lengths_df, min_length, max_length):
    """Draw read-length box plots for every sample type and colour scheme.

    The counts in `sequence_lengths_df` are converted to percentages of each
    row's total, then one plot is produced per combination of sample type,
    entry in `colour_sets`, and plot_mean flag (False and True). Images are
    placed in the directory "<description>_read_length_graphs".

    Args:
        description: Label used for the output directory name and passed on
            to `plot_greyscale_boxplot`.
        sequence_lengths_df: DataFrame of counts; each row is normalised by
            its own sum. (Presumably one row per sample and one column per
            read length -- confirm against the caller.)
        min_length: Passed through unchanged to `plot_greyscale_boxplot`.
        max_length: Passed through unchanged to `plot_greyscale_boxplot`.

    Returns:
        None.
    """
    graph_dir = "%s_read_length_graphs" % description
    sacgf_utils.mk_path(graph_dir)

    # Scale every row so that its values sum to 100.
    total_reads = sequence_lengths_df.sum(axis=1)
    reads_percent = sequence_lengths_df.divide(total_reads / 100.0, axis=0)

    # One shared y-axis limit for all plots: the next multiple of 5 above the
    # largest percentage anywhere in the table.
    max_percent = reads_percent.max().max()
    y_limit = np.floor(1 + max_percent / 5) * 5  # use multiples of 5

    # Called after y_limit is computed, so anything the helper adds to the
    # frame cannot affect the maximum. Presumably it adds the SAMPLE_TYPE
    # column grouped on below -- confirm against the helper.
    add_conditions_and_sample_types(reads_percent)

    for (sample_type, sample_df) in reads_percent.groupby(SAMPLE_TYPE):
        # .items() instead of the Python-2-only .iteritems(): same pairs,
        # works on both Python 2 and Python 3.
        for (color_scheme, colors) in colour_sets.items():
            for plot_mean in [False, True]:
                mean_description = "_mean" if plot_mean else ""
                graph_image = "read_lengths_%s%s_%s_boxplot" % (color_scheme, mean_description, sample_type)
                graph_image = os.path.join(graph_dir, graph_image)
                plot_greyscale_boxplot(graph_image, description, sample_type, sample_df,
                                       colors, y_limit, min_length, max_length, plot_mean)
def save_df_with_condition_and_samples(filename, df):
    """Write `df` to `filename` as CSV after annotating a private copy.

    The copy is passed to `lim_pirna_common.add_conditions_and_sample_types`
    before saving, so the caller's DataFrame is never modified.

    Args:
        filename: Destination handed to `DataFrame.to_csv`.
        df: DataFrame to annotate and save.

    Returns:
        None.
    """
    annotated = df.copy()  # work on a copy; leave the caller's frame alone
    lim_pirna_common.add_conditions_and_sample_types(annotated)
    annotated.to_csv(filename)