# Load the data set and the sequencing-run bookkeeping used by the analysis.
data = grapher.import_file(filename)
run_ids = set_up_sequencing_run_ids()
(dmso, kla, kla_dex,
 all_dmso, all_kla, all_kla_dex) = get_sequencing_run_id_sets()
total_tags = total_tags_per_run()

# Norm sum scalars listed for all, group 1, group 2, group 3, group 4
kla_scalars = [1.223906, 1.281572, 1.118363, 1.104860, 1.503260]
kla_dex_scalars = [1.182574, 1.147695, 1.248636, 1.069588, 1.388871]
dex_over_kla_scalars = [1.069073, 0.967659, 1.122628, 1.008758, 0.927466]

# Each pass is (tag-count column template, scalars, extra keyword arguments).
# The passes run in this order; the last one rescales the KLA+Dex tag counts
# again with the dex-over-KLA scalars and hands suffix='_norm_2' to
# grapher.normalize (presumably to name the output separately -- confirm
# against grapher.normalize).
norm_passes = (
    ('kla_{0}tag_count', kla_scalars, {}),
    ('kla_dex_{0}tag_count', kla_dex_scalars, {}),
    ('kla_dex_{0}tag_count', dex_over_kla_scalars, {'suffix': '_norm_2'}),
)
for column_template, pass_scalars, norm_kwargs in norm_passes:
    # The position in each scalar list is what get_rep_string receives.
    for i, scalar in enumerate(pass_scalars):
        tag_column = column_template.format(get_rep_string(i))
        data = grapher.normalize(data, tag_column, scalar, **norm_kwargs)

# Keep rows flagged as having a RefSeq entry, then require a transcript
# score of at least 10.
refseq = data[data['has_refseq'] != 0]
refseq = refseq[refseq['transcript_score'] >= 10]

scatter_dirpath = grapher.get_filename(dirpath, 'scatterplots')
# Minimal ratio in KLA+Dex vs. KLA pausing
# Optional first command-line argument; falls back to -1 when absent.
try:
    min_ratio = float(sys.argv[1])
except IndexError:
    min_ratio = -1

# Optional third command-line argument; falls back to 4 when absent.
# NOTE(review): treated here as the minimum number of replicates a gene must
# pass in, because its default matches the value the summary below used to
# hard-code -- confirm against the rest of the script.
try:
    thresh = int(sys.argv[3])
except IndexError:
    thresh = 4

# Per-gene counter of how many replicates met the cutoff.
grouped['relevant_sets'] = 0
rep_ids = (1, 2, 3, 4)
for replicate_id in rep_ids:
    rep_str = get_rep_string(replicate_id)
    lfc_column = 'kla_{0}gene_body_lfc'.format(rep_str)
    primary_min = grouped[lfc_column] <= min_ratio

    # Plain boolean-mask indexing instead of the deprecated/removed `.ix`.
    repressed_set = grouped[primary_min]['glass_transcript_id'].values.tolist()
    grouped['relevant_sets'] += primary_min.apply(int)
    repressed.append(repressed_set)

    # A single pre-formatted string prints identically under the print
    # statement and the print function.
    print('Genes with rep {0} KLA LFC <= {1}: {2}'.format(
        rep_str, min_ratio, len(repressed_set)))

# Report against the configured threshold rather than a literal 4, so the
# summary stays consistent when the threshold is overridden.
print('Repressed in at least {0} reps: {1}'.format(
    thresh, sum(grouped['relevant_sets'] >= thresh)))
def draw_elongation_profile(data, grapher, dirpath,
                            show_moving_average=True, show_count=False):
    """Plot and save tags-per-basepair profiles split by log fold change.

    For every active comparison and for the pooled data plus replicate
    groups 1-4, the rows of ``data`` are split into "no change", "up" and
    "down" subsets using the comparison's ``*gene_body_lfc`` column. Each
    subset is summed per basepair for the DMSO, KLA and KLA+Dex sequencing
    runs, plotted, and written out as a PNG.

    Args:
        data: table indexed by column name that supports ``groupby``; the
            'basepair' and 'sequencing_run_id' columns and the relevant
            ``*gene_body_lfc`` columns are read.
        grapher: object providing ``get_tag_scalars``,
            ``plot_tags_per_basepair``, ``save_plot`` and ``get_filename``.
        dirpath: directory handed to ``grapher.get_filename`` for the
            output images.
        show_moving_average: forwarded to ``plot_tags_per_basepair``.
        show_count: forwarded to ``plot_tags_per_basepair``.
    """
    run_ids = set_up_sequencing_run_ids()
    total_tags = total_tags_per_run()

    # (description, column template) pairs; disabled entries kept for
    # reference.
    comparisons = (
        #('Special', 'group_{0}'),
        #('KLA','kla_{0}gene_body_lfc'),# ('KLA+Dex','kla_dex_{0}gene_body_lfc'),
        ('KLA+Dex over KLA', 'dex_over_kla_{0}gene_body_lfc'),
    )
    # Keys into run_ids, in the same order as the plot labels below.
    conditions = ('dmso', 'kla', 'kla_dex')

    for desc, lfc_template in comparisons:
        # '' stands for the pooled data; 1-4 are the replicate groups.
        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            lfc_column = lfc_template.format(rep_str)

            # The pooled case uses slot 0 of the per-replicate sequences.
            rep_index = replicate_id or 0
            if replicate_id:
                title_suffix = 'Group {0}'.format(replicate_id)
                file_suffix = 'group_{0}'.format(replicate_id)
            else:
                title_suffix = 'overall'
                file_suffix = 'all'

            # Include all transcripts at once, but only do it once.
            if desc == 'Special':
                kla_up = data['kla_{0}gene_body_lfc'.format(rep_str)] >= 1
                dex_down = data[
                    'dex_over_kla_{0}gene_body_lfc'.format(rep_str)] <= -1
                subsets = [
                    ('All RefSeq', data),
                    ('Up > 2x in KLA, Down > 1.5x from that in Dex',
                     data[kla_up & dex_down]),
                ]
            else:
                lfc_values = data[lfc_column]
                subsets = [
                    ('No change in {0}'.format(desc),
                     data[lfc_values.abs() < 1]),
                    ('Up in {0}'.format(desc), data[lfc_values >= 1]),
                    ('Down in {0}'.format(desc), data[lfc_values <= -1]),
                ]

            for label, subset in subsets:
                slug_label = label.lower().replace(' ', '_')

                # Sum per basepair within each sequencing run first.
                per_run = subset.groupby(
                    ['basepair', 'sequencing_run_id'], as_index=False).sum()

                # Pick out the runs belonging to each condition.
                per_condition = [
                    per_run[per_run['sequencing_run_id'].isin(
                        run_ids[condition][rep_index])]
                    for condition in conditions
                ]
                # Combine for sequencing runs now
                groups = [
                    frame.groupby(['basepair'], as_index=False).sum()
                    for frame in per_condition
                ]

                # Transpose the per-condition totals and take the entry for
                # this replicate slot.
                # NOTE(review): assumes total_tags.values() iterates in
                # DMSO, KLA, KLA+Dex order so the scalars line up with the
                # plotted groups -- confirm.
                totals = list(zip(*total_tags.values()))[rep_index]
                tag_scalars = grapher.get_tag_scalars(totals)

                plot_title = 'Tag localization for RefSeq: {0}, {1}'.format(
                    label, title_suffix)
                grapher.plot_tags_per_basepair(
                    groups,
                    labels=['DMSO', 'KLA', 'KLA+Dex'],
                    title=plot_title,
                    tag_scalars=tag_scalars,
                    show_moving_average=show_moving_average,
                    show_count=show_count)

                image_name = '{0}_refseq_by_run_type_{1}.png'.format(
                    slug_label, file_suffix)
                grapher.save_plot(grapher.get_filename(dirpath, image_name))