Ejemplo n.º 1
0
    data = grapher.import_file(filename)

    run_ids = set_up_sequencing_run_ids()
    dmso, kla, kla_dex, all_dmso, all_kla, all_kla_dex = get_sequencing_run_id_sets(
    )
    total_tags = total_tags_per_run()

    # Norm sum scalars listed for all, group 1, group 2, group 3, group 4
    kla_scalars = [1.223906, 1.281572, 1.118363, 1.104860, 1.503260]
    kla_dex_scalars = [1.182574, 1.147695, 1.248636, 1.069588, 1.388871]
    dex_over_kla_scalars = [1.069073, 0.967659, 1.122628, 1.008758, 0.927466]

    for i, scalar in enumerate(kla_scalars):
        data = grapher.normalize(data,
                                 'kla_{0}tag_count'.format(get_rep_string(i)),
                                 scalar)
    for i, scalar in enumerate(kla_dex_scalars):
        data = grapher.normalize(
            data, 'kla_dex_{0}tag_count'.format(get_rep_string(i)), scalar)
    for i, scalar in enumerate(dex_over_kla_scalars):
        data = grapher.normalize(data,
                                 'kla_dex_{0}tag_count'.format(
                                     get_rep_string(i)),
                                 scalar,
                                 suffix='_norm_2')

    refseq = data[data['has_refseq'] != 0]
    refseq = refseq[refseq['transcript_score'] >= 10]

    scatter_dirpath = grapher.get_filename(dirpath, 'scatterplots')
Ejemplo n.º 2
0
        # Minimal ratio in KLA+Dex vs. KLA pausing
        try:
            min_ratio = float(sys.argv[1])
        except IndexError:
            min_ratio = -1
        try:
            thresh = int(sys.argv[3])
        except IndexError:
            thresh = 4

        grouped['relevant_sets'] = 0

        rep_ids = (1, 2, 3, 4)
        for replicate_id in rep_ids:
            rep_str = get_rep_string(replicate_id)

            primary_min = grouped['kla_{0}gene_body_lfc'.format(
                rep_str)] <= min_ratio

            repressed_set = grouped.ix[primary_min][
                'glass_transcript_id'].values.tolist()
            grouped['relevant_sets'] += primary_min.apply(int)

            repressed.append(repressed_set)

            print 'Genes with rep {0} KLA LFC <= {1}:'.format(
                rep_str, min_ratio), len(repressed_set)

        print 'Repressed in at least 4 reps:', sum(
            grouped['relevant_sets'] >= 4)
Ejemplo n.º 3
0
def _select_elongation_datasets(data, desc, lfc_str, rep_str):
    """Return the ``(label, subset)`` pairs to plot for one LFC column.

    Args:
        data: Table indexed by column name and filtered with boolean masks
            (presumably a pandas DataFrame -- confirm against the caller).
        desc: Description of the comparison. The value ``'Special'``
            selects a fixed pair of datasets instead of the
            no-change/up/down split.
        lfc_str: Name of the column compared against the +/-1 thresholds
            when ``desc`` is not ``'Special'``.
        rep_str: Replicate string interpolated into the column names used
            by the ``'Special'`` datasets.

    Returns:
        A list of ``(label, subset_of_data)`` tuples.
    """
    if desc == 'Special':
        # Include all transcripts at once, but only do it once.
        kla_lfc = data['kla_{0}gene_body_lfc'.format(rep_str)]
        dex_over_kla_lfc = data[
            'dex_over_kla_{0}gene_body_lfc'.format(rep_str)]
        return [
            ('All RefSeq', data),
            ('Up > 2x in KLA, Down > 1.5x from that in Dex',
             data[(kla_lfc >= 1) & (dex_over_kla_lfc <= -1)]),
        ]

    return [
        ('No change in {0}'.format(desc), data[data[lfc_str].abs() < 1]),
        ('Up in {0}'.format(desc), data[data[lfc_str] >= 1]),
        ('Down in {0}'.format(desc), data[data[lfc_str] <= -1]),
    ]


def draw_elongation_profile(data,
                            grapher,
                            dirpath,
                            show_moving_average=True,
                            show_count=False):
    """Plot and save tags-per-basepair profiles split by log fold change.

    For every configured LFC column and every replicate (``''`` meaning
    the combined data, then 1 through 4), ``data`` is split into
    no-change / up / down subsets. Each subset is summed per basepair for
    the DMSO, KLA and KLA+Dex sequencing runs, plotted with
    ``grapher.plot_tags_per_basepair`` and written out with
    ``grapher.save_plot``.

    Args:
        data: Table with ``basepair``, ``sequencing_run_id`` and the
            ``*_gene_body_lfc`` columns referenced below (presumably a
            pandas DataFrame -- confirm against the caller).
        grapher: Object providing ``get_tag_scalars``,
            ``plot_tags_per_basepair``, ``save_plot`` and ``get_filename``.
        dirpath: Directory passed to ``grapher.get_filename`` for the
            output images.
        show_moving_average: Forwarded to
            ``grapher.plot_tags_per_basepair``.
        show_count: Forwarded to ``grapher.plot_tags_per_basepair``.
    """
    run_ids = set_up_sequencing_run_ids()
    total_tags = total_tags_per_run()

    # Transpose the per-key sequences so that element i holds the i-th
    # entry of every value in total_tags. list() is required because zip()
    # returns a non-subscriptable iterator on Python 3.
    # NOTE(review): this relies on total_tags.values() yielding its entries
    # in the same order as the plotted groups (DMSO, KLA, KLA+Dex) --
    # confirm that total_tags_per_run() returns an ordered mapping.
    totals_by_replicate = list(zip(*total_tags.values()))

    # Each entry is (description, LFC column template). Disabled entries
    # are kept for reference and can be re-enabled as needed:
    #   ('Special', 'group_{0}'),
    #   ('KLA', 'kla_{0}gene_body_lfc'),
    #   ('KLA+Dex', 'kla_dex_{0}gene_body_lfc'),
    lfcs = (
        ('KLA+Dex over KLA', 'dex_over_kla_{0}gene_body_lfc'),
    )
    # Keys of run_ids, in the same order as the plot labels below.
    conditions = ('dmso', 'kla', 'kla_dex')

    for desc, lfc in lfcs:
        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            lfc_str = lfc.format(rep_str)

            # The empty replicate id stands for the combined data, which
            # lives at index 0 of the per-replicate lookups.
            run_index = replicate_id or 0
            totals = totals_by_replicate[run_index]

            if replicate_id:
                title_suffix = 'Group {0}'.format(replicate_id)
                file_suffix = 'group_{0}'.format(replicate_id)
            else:
                title_suffix = 'overall'
                file_suffix = 'all'

            datasets = _select_elongation_datasets(
                data, desc, lfc_str, rep_str)

            for label, dataset in datasets:
                slug_label = label.lower().replace(' ', '_')
                data_grouped = dataset.groupby(
                    ['basepair', 'sequencing_run_id'],
                    as_index=False).sum()

                # Keep only the runs belonging to each condition, then
                # combine those sequencing runs into one row per basepair.
                groups = []
                for condition in conditions:
                    in_condition = data_grouped['sequencing_run_id'].isin(
                        run_ids[condition][run_index])
                    groups.append(
                        data_grouped[in_condition].groupby(
                            ['basepair'], as_index=False).sum())

                tag_scalars = grapher.get_tag_scalars(totals)
                grapher.plot_tags_per_basepair(
                    groups,
                    labels=['DMSO', 'KLA', 'KLA+Dex'],
                    title='Tag localization for RefSeq: {0}, {1}'.format(
                        label, title_suffix),
                    tag_scalars=tag_scalars,
                    show_moving_average=show_moving_average,
                    show_count=show_count)
                grapher.save_plot(
                    grapher.get_filename(
                        dirpath,
                        '{0}_refseq_by_run_type_{1}.png'.format(
                            slug_label, file_suffix)))