Example #1
def get_data_with_bucket_score(yzer, dirpath):
    filename = yzer.get_filename(
        dirpath, 'refseq_by_transcript_and_bucket_with_lfc.txt')
    data = yzer.import_file(filename)
    data = data.fillna(0)

    run_ids = set_up_sequencing_run_ids()
    total_tags = total_tags_per_run()
    # For each sequencing run group, fill in the bucket score values.
    for replicate_id in ('', 1, 2, 3, 4):
        rep_str = get_rep_string(replicate_id)
        for run_type, id_set in run_ids.iteritems():
            curr_ids = id_set[replicate_id or 0]
            # First subset rows to the sequencing run ids of interest.
            subset = data[data['sequencing_run_id'].isin(curr_ids)]
            # Group by transcript, and assign bucket score to each row
            grouped = subset.groupby('glass_transcript_id')
            bucket_scores = grouped.apply(bucket_score)
            gene_start_sums = grouped.apply(gene_start_tags)
            gene_body_sums = grouped.apply(gene_body_tags)

            # Fill in the per-transcript values for each original row.
            bucket_col = '{0}_{1}bucket_score'.format(run_type, rep_str)
            start_col = '{0}_{1}gene_start_tags'.format(run_type, rep_str)
            body_col = '{0}_{1}gene_body_tags'.format(run_type, rep_str)
            data[bucket_col] = bucket_scores[data['glass_transcript_id']].values
            data[start_col] = gene_start_sums[data['glass_transcript_id']].values
            data[body_col] = gene_body_sums[data['glass_transcript_id']].values

            # Fill NAs created by transcripts missing from certain sequencing runs.
            data[bucket_col] = data[bucket_col].fillna(1)  # Use 1 to show parity.
            data[start_col] = data[start_col].fillna(0)
            data[body_col] = data[body_col].fillna(0)

        # Now calculate gene body log fold change, normalizing by the ratio of
        # total tag counts to account for sequencing depth.
        rep_idx = replicate_id or 0
        kla_norm = float(total_tags['dmso'][rep_idx]) / total_tags['kla'][rep_idx]
        dex_kla_norm = float(total_tags['kla'][rep_idx]) / total_tags['kla_dex'][rep_idx]
        data['kla_{0}gene_body_lfc'.format(rep_str)] = data.apply(
            lambda x: gene_body_lfc(x, kla_norm, rep_str, 'kla', 'dmso'),
            axis=1)
        data['dex_over_kla_{0}gene_body_lfc'.format(rep_str)] = data.apply(
            lambda x: gene_body_lfc(x, dex_kla_norm, rep_str, 'kla_dex', 'kla'),
            axis=1)
    return data
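
The helpers referenced above (get_rep_string, bucket_score, gene_start_tags, gene_body_tags, gene_body_lfc, set_up_sequencing_run_ids, total_tags_per_run) live elsewhere in the project and are not part of this listing. Below is a minimal sketch of what a few of them might look like, purely as assumptions: rep_str is taken to be '' for the pooled case and 'N_' for replicate N (matching column names such as dmso_1_rpkm in Example #5), the bucket score is taken to be a pausing-ratio-style gene-start/gene-body tag ratio, and the log fold change a log2 of depth-normalized gene-body counts. Column names and bucket cutoffs inside the sketch are guesses, not the project's real code.

# Hypothetical sketches only; the project's real implementations may differ.
import math


def get_rep_string(replicate_id):
    # Assumed: '' for the pooled group, '1_', '2_', ... for replicate groups.
    return '{0}_'.format(replicate_id) if replicate_id else ''


def gene_start_tags(group):
    # Assumed: tags in the buckets nearest the transcription start site.
    return group[group['bucket'] <= 4]['tag_count'].sum()


def gene_body_tags(group):
    # Assumed: tags in the remaining, gene-body buckets.
    return group[group['bucket'] > 4]['tag_count'].sum()


def bucket_score(group):
    # Assumed: pausing-ratio-style score, gene-start tags over gene-body tags.
    body = gene_body_tags(group)
    return float(gene_start_tags(group)) / body if body else 1.0


def gene_body_lfc(row, norm, rep_str, numer, denom):
    # Assumed: log2 fold change of depth-normalized gene-body tag counts,
    # e.g. numer='kla', denom='dmso' with norm = dmso total / kla total.
    numer_tags = row['{0}_{1}gene_body_tags'.format(numer, rep_str)] * norm
    denom_tags = row['{0}_{1}gene_body_tags'.format(denom, rep_str)]
    if not numer_tags or not denom_tags:
        return 0.0
    return math.log(numer_tags / denom_tags, 2)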
Example #2
def draw_boxplot(data, label, dirpath):
    # Relies on the module's os and pyplot imports and the shared grapher
    # plotting helper, none of which are shown in this excerpt.
    curr_dirpath = grapher.get_filename(dirpath, 'boxplots_{0}'.format(label))
    if not os.path.exists(curr_dirpath):
        os.mkdir(curr_dirpath)

    # Collapse to one row per transcript by taking the mean; the columns used
    # below are constant across a transcript's rows, so the mean simply
    # de-duplicates them.
    data = data.groupby('glass_transcript_id', as_index=False).mean()
    states = (
        ('KLA', 'kla_{0}state'),
        ('KLA+Dex', 'kla_dex_{0}state'),
        ('KLA+Dex over KLA', 'dex_over_kla_{0}state'),
    )
    for desc, state in states:

        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            state_str = state.format(rep_str)

            datasets = [
                ('No change', data[data[state_str] == 0]),
                ('Up >= 2x', data[data[state_str] == 1]),
                ('Down >= 2x', data[data[state_str] == -1]),
            ]

            bar_names, datasets = zip(*datasets)
            pausing_ratios = [
                d['kla_dex_{0}bucket_score'.format(rep_str)] /
                d['kla_{0}bucket_score'.format(rep_str)] for d in datasets
            ]

            ax = grapher.boxplot(
                pausing_ratios,
                bar_names,
                title='Pausing Ratio Delta in {0}, {1}'.format(
                    desc,
                    'Group {0}'.format(replicate_id) if replicate_id
                    else 'Overall'),
                xlabel='State in {0} {1}'.format(desc, replicate_id),
                ylabel='Pausing ratio delta: '
                       '(KLA+Dex pausing ratio)/(KLA pausing ratio)',
                show_outliers=False,
                show_plot=False)

            pyplot.text(0.05, 0.9,
                        'Total transcripts: {0}'.format(len(data)),
                        transform=ax.transAxes)
            grapher.save_plot(
                grapher.get_filename(curr_dirpath,
                                     'boxplot_{0}.png'.format(state_str)))
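
grapher is a project-specific plotting object shared at module level (as are os and pyplot above); its boxplot and save_plot methods are not shown in this listing. A rough matplotlib-only equivalent of the plotting step, offered only as an assumption about what those wrappers do:

# Hypothetical stand-in for grapher.boxplot + grapher.save_plot.
from matplotlib import pyplot


def simple_boxplot(series_list, bar_names, title, xlabel, ylabel, filename):
    fig, ax = pyplot.subplots()
    # showfliers=False plays the role of show_outliers=False above.
    ax.boxplot([s.dropna().values for s in series_list],
               labels=list(bar_names), showfliers=False)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(filename)
    pyplot.close(fig)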
Example #3
def get_tag_proportions(data, label):

    total_trans = len(set(data['glass_transcript_id']))
    print '\n\n\nTotal transcripts in {0}:\t{1}'.format(label, total_trans)

    states = (
        ('KLA', 'kla_{0}state'),
        ('KLA+Dex', 'kla_dex_{0}state'),
        ('KLA+Dex over KLA', 'dex_over_kla_{0}state'),
    )
    for desc, state in states:
        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            state_str = state.format(rep_str)
            if desc == 'KLA':
                datasets = [
                    ('Up > 2x in KLA, Down > 1.5x from that in Dex {0}'.format(
                        replicate_id),
                     data[(data['kla_{0}state'.format(rep_str)] == 1) &
                          (data['dex_over_kla_{0}state'.format(rep_str)] == -1)]),
                ]
            else:
                datasets = []
            datasets += [
                ('No change in {0} {1}'.format(desc, replicate_id),
                 data[data[state_str] == 0]),
                ('Up in {0} {1}'.format(desc, replicate_id),
                 data[data[state_str] == 1]),
                ('Down in {0} {1}'.format(desc, replicate_id),
                 data[data[state_str] == -1]),
            ]
            for name, dataset in datasets:
                dataset_trans = len(set(dataset['glass_transcript_id']))
                print 'Total transcripts in {0}:\t{1}'.format(
                    name, dataset_trans)
                # Use float math here; integer division would truncate to 0.
                print 'Percent of transcripts in {0}:\t{1}'.format(
                    name, 100.0 * dataset_trans / total_trans)
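
For the plain up / no change / down breakdown, the same proportions can also be read off the state column with pandas value_counts; a short sketch, assuming (as the groupby-mean in Example #2 does) that the state is constant across all rows of a transcript. The combined KLA-up, Dex-down subset would still need the explicit boolean filter used above.

# Alternative proportion breakdown via value_counts (sketch).
per_transcript_state = data.groupby('glass_transcript_id')[state_str].first()
fractions = per_transcript_state.value_counts(normalize=True)
for state_value, desc_text in ((1, 'Up'), (0, 'No change'), (-1, 'Down')):
    print '{0}:\t{1:.3f}'.format(desc_text, fractions.get(state_value, 0.0))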
Example #4
def draw_elongation_profile(data,
                            grapher,
                            dirpath,
                            show_moving_average=True,
                            show_count=False):
    run_ids = set_up_sequencing_run_ids()
    total_tags = total_tags_per_run()

    states = (
        ('Special', 'group_{0}'),
        ('KLA', 'kla_{0}lfc'),
        ('KLA+Dex', 'kla_dex_{0}lfc'),
        ('KLA+Dex over KLA', 'dex_over_kla_{0}lfc'),
    )
    for desc, state in states:
        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            state_str = state.format(rep_str)

            # The 'Special' pseudo-state covers all transcripts plus the
            # KLA-up / Dex-down subset; handle it here once per replicate
            # rather than repeating it for every state.
            if desc == 'Special':
                datasets = [
                    ('All RefSeq', data),
                    ('Up > 2x in KLA, Down > 1.5x from that in Dex',
                     data[(data['kla_{0}lfc'.format(rep_str)] >= 1) &
                          (data['dex_over_kla_{0}lfc'.format(rep_str)] <= -.58)]),
                ]
            else:
                datasets = [
                    ('No change in {0}'.format(desc),
                     data[data[state_str].abs() < 1]),
                    ('Up in {0}'.format(desc), data[data[state_str] >= 1]),
                    ('Down in {0}'.format(desc), data[data[state_str] <= -1]),
                ]

            for label, dataset in datasets:
                slug_label = label.lower().replace(' ', '_')
                group_by_cols = ['basepair', 'sequencing_run_id']
                data_grouped = dataset.groupby(group_by_cols,
                                               as_index=False).sum()

                groups = [
                    data_grouped[data_grouped['sequencing_run_id'].isin(
                        run_ids['dmso'][replicate_id or 0])],
                    data_grouped[data_grouped['sequencing_run_id'].isin(
                        run_ids['kla'][replicate_id or 0])],
                    data_grouped[data_grouped['sequencing_run_id'].isin(
                        run_ids['kla_dex'][replicate_id or 0])]
                ]

                # Combine for sequencing runs now
                for i, group in enumerate(groups):
                    groups[i] = group.groupby(['basepair'],
                                              as_index=False).sum()

                # Transpose the per-run-type totals and take this replicate's column.
                totals = zip(*total_tags.values())[replicate_id or 0]
                tag_scalars = grapher.get_tag_scalars(totals)
                ax = grapher.plot_tags_per_basepair(
                    groups,
                    labels=['DMSO', 'KLA', 'KLA+Dex'],
                    title='Tag localization for RefSeq: {0}, {1}'.format(
                        label,
                        'Group {0}'.format(replicate_id) if replicate_id
                        else 'overall'),
                    tag_scalars=tag_scalars,
                    show_moving_average=show_moving_average,
                    show_count=show_count)
                grapher.save_plot(
                    grapher.get_filename(
                        dirpath, '{0}_refseq_by_run_type_{1}.png'.format(
                            slug_label,
                            'group_{0}'.format(replicate_id) if replicate_id
                            else 'all')))
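
get_tag_scalars and plot_tags_per_basepair belong to the same project-specific grapher object and are not shown here. A minimal sketch of one way the depth normalization could work, assuming the scalars simply rescale each run to the smallest library in the DMSO/KLA/KLA+Dex trio; whether the real implementation scales to the smallest library, to a fixed tag count, or per million is not visible in this listing.

# Hypothetical version of get_tag_scalars: normalize runs to a common depth.
def get_tag_scalars(totals):
    # One scalar per run; multiplying a run's per-basepair tag counts by its
    # scalar rescales it to the smallest library, making the curves comparable.
    smallest = float(min(totals))
    return [smallest / total for total in totals]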
Example #5
    if True:
        data = grapher.import_file(
            grapher.get_filename(dirpath,
                                 'transcripts_per_run_per_bucket.txt'))
        # For graphing, derive a pseudo-basepair position from the bucket index
        # (each bucket spans 50 bp).
        data['basepair'] = (data['bucket'] - 1) * 50

        # Merge on transcripts to get DMSO tags.
        transcripts = grapher.import_file(
            grapher.get_filename(dirpath, 'transcript_vectors.txt'))
        transcripts = transcripts[transcripts['has_refseq'] == 1]
        transcripts['glass_transcript_id'] = transcripts['id']
        totals = total_tags_per_run()
        for replicate_id in ('', 1, 2, 3, 4):
            rep_str = get_rep_string(replicate_id)
            # RPKM = tag count * 10^9 / (transcript length * total tags in run).
            transcripts['dmso_{0}rpkm'.format(rep_str)] = (
                transcripts['dmso_{0}tag_count'.format(rep_str)] * (10**3 * 10**6)
                / transcripts['length']
                / totals['dmso'][replicate_id or 0])

        transcripts = transcripts.filter(
            regex=r'glass_transcript_id|has_refseq|.*lfc|.*rpkm')

        data = data.merge(transcripts, how='left', on='glass_transcript_id')
        data = data.fillna(0)
        data = data[data['has_refseq'] == 1]

        # Create filtered groups.
        datasets = [
            ('all_refseq', data),
            ('low_basal', data[data['dmso_rpkm'] < 1]),
            ('low_basal_1', data[data['dmso_1_rpkm'] < 1]),