    data = data[data.filter(like='h3k4me2').max(axis=1) > min_tags]
    data = data[data['minimal_distance'] >= 1000]

    transcripts = yzer.import_file(
        yzer.get_filename(dirpath, 'transcript_vectors.txt'))
    transcripts['nearest_refseq_transcript_id'] = transcripts['id']
    data = data.merge(transcripts,
                      how='left',
                      on='nearest_refseq_transcript_id',
                      suffixes=['', '_trans'])

    data = data.fillna(0)

    total_tags = total_tags_per_run()

    data['h4k8ac_kla_dex_ratio'] = nonzero(
        data['h4k8ac_kla_dex_tag_count']) / nonzero(
            data['h4k8ac_kla_tag_count'])
    data['dmso_1_rpkm'] = data['dmso_1_tag_count_trans'] * (
        10**3 * 10**6) / data['length_trans'] / total_tags['dmso'][1]
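
    # A quick sanity check of the RPKM arithmetic above, with hypothetical
    # numbers (not from this dataset): 200 tags on a 2,000 bp transcript in a
    # run with 20 million total tags comes out to 5.0.
    if False:
        example_rpkm = 200 * (10**3 * 10**6) / 2000 / 2e7
        print example_rpkm  # 5.0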

    if False:
        # Low basal expression only
        data = data[data['dmso_1_rpkm'] > 2]
        img_dirpath = yzer.get_and_create_path(
            dirpath, 'boxplots_by_expression_high_basal',
            consistent and 'consistent' or 'rep1')
    if False:
        # Lose acetyl only
        data = data[data['h4k8ac_kla_dex_ratio'] < .75]
        img_dirpath = yzer.get_and_create_path(
            dirpath, 'boxplots_by_expression_lose_ac',
            consistent and 'consistent' or 'rep1')
    data = data[data['kla_1_lfc'] >= 1]
    data = data[data['gr_kla_dex_tag_count'] > 0]
    data = data[data['gr_fa_kla_dex_tag_count'] == 0]
    print len(data)
    # Require some PU.1 signal in either KLA or KLA+Dex when filtering on PU.1.
    if pu_1: data = data[data['pu_1_kla_tag_count'] + data['pu_1_kla_dex_tag_count'] > 0]


    data = data.fillna(0)
    data = ucsc_link_cleanup(data)


    colname = 'dex_over_kla_1_lfc'

    if pu_1:
        colname = 'pu_1_ratio'
        data[colname] = numpy.log2(nonzero(data['pu_1_kla_dex_tag_count'])/nonzero(data['pu_1_kla_tag_count']))

    # Classify peaks by how the tag count shifts from KLA to KLA+Dex, using a
    # `ratio`-fold threshold against the nearby count in the other condition:
    # no signal at all, lost on Dex (kla_gt), gained on Dex (kla_dex_gt), or
    # present but not changed `ratio`-fold either way (nc).
    none = (data['{0}_kla_nearby_tag_count'.format(peak_type)] + data['{0}_kla_dex_nearby_tag_count'.format(peak_type)] == 0)
    kla_gt = (data['{0}_kla_tag_count'.format(peak_type)] > ratio*data['{0}_kla_dex_nearby_tag_count'.format(peak_type)])
    kla_dex_gt = (data['{0}_kla_dex_tag_count'.format(peak_type)] > ratio*data['{0}_kla_nearby_tag_count'.format(peak_type)])
    nc = (data['{0}_kla_nearby_tag_count'.format(peak_type)] + data['{0}_kla_dex_nearby_tag_count'.format(peak_type)] > 0) \
        & (data['{0}_kla_dex_tag_count'.format(peak_type)] < ratio*data['{0}_kla_nearby_tag_count'.format(peak_type)]) \
        & (data['{0}_kla_tag_count'.format(peak_type)] < ratio*data['{0}_kla_dex_nearby_tag_count'.format(peak_type)])

    data['{0}_kla_to_kla_dex_ratio'.format(peak_type)] = data['{0}_kla_tag_count'.format(peak_type)]\
                                                            /data['{0}_kla_dex_nearby_tag_count'.format(peak_type)]

    print sum(kla_gt)
    print sum(kla_dex_gt)
    data[kla_gt].to_csv(yzer.get_filename(img_dirpath, 'enhancer_like_lose_{0}_{1}x_change_dsg_only.txt'.format(
                                                                                    peak_type, ratio)),
Example #3
'''
from __future__ import division
from glasslab.dataanalysis.graphing.seq_grapher import SeqGrapher
from glasslab.utils.functions import nonzero
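
# The `nonzero` helper imported above is applied to count columns throughout
# these examples before ratios and log2 transforms. A hypothetical stand-in
# with the assumed semantics (clamp zeros up to 1 so divisions and logs stay
# finite; the real glasslab implementation may differ):
def _nonzero_sketch(series):
    # Replace zero entries with 1, leaving nonzero counts untouched.
    return series.where(series != 0, 1)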

if __name__ == '__main__':
    yzer = SeqGrapher()
    dirpath = 'karmel/Desktop/Projects/GlassLab/Notes_and_Reports/Glass Atlas/Demo-data'
    dirpath = yzer.get_path(dirpath)

    img_dirpath = yzer.get_and_create_path(dirpath,
                                           'refseq_to_homer/large_gap_500bp')

    data = yzer.import_file(
        yzer.get_filename(dirpath, 'refseq_tag_counts_500bp.txt'))
    data['sum'] = nonzero(data['sum'].fillna(0))

    homer_data = yzer.import_file(
        yzer.get_filename(dirpath, 'RNA_GroSeq_CountsGenes.txt'))
    homer_data['sequence_identifier'] = homer_data['Gene ID']
    homer_data['homer_tag_count'] = nonzero(homer_data[
        'ThioMac-GroSeq-notx-110513/ genes (Total: 12166480.0) normFactor 0.82']
                                            .fillna(0))
    homer_data = homer_data[['sequence_identifier', 'homer_tag_count']]

    merged = data.merge(homer_data, how='inner', on='sequence_identifier')
    merged = merged.fillna(1)

    if True:
        ax = yzer.scatterplot(merged,
                              xcolname='homer_tag_count',
Example #4
    kla_col = 'kla_lfc'

    tss_only = False
    img_dirpath = yzer.get_and_create_path(
        dirpath, 'interactions_by_kla_lfc', tss_only and 'genic'
        or 'all_interactions', 'lfc_2')

    # File generated in novel_me2_sites
    enhancers = yzer.import_file(
        yzer.get_filename(
            data_dirpath,
            'all_enhancers_with_me2_and_{0}interaction_stats.txt'.format(
                tss_only and 'tss_' or '')))

    for kla_timepoint in ('1h', ):
        enhancers['me2_ratio'] = nonzero(enhancers['me2_kla_6h_tag_count_2'])/\
                                    nonzero(enhancers['me2_notx_tag_count_2'])

        sets = OrderedDict()
        sets['4x GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
            enhancers[kla_col] > 2]
        sets['No change GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
            enhancers[kla_col].abs() <= 1]
        sets['1/4 GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
            enhancers[kla_col] < -2]
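        # The thresholds above are on a log2 fold change: > 2 means at least
        # 4x up in KLA, |lfc| <= 1 means within 2x, and < -2 means at least
        # 4x down, matching the set labels.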

        labels = [
            l + '\n(count: {0})'.format(len(v))
            for l, v in zip(sets.keys(), sets.values())
        ]
Example #5
    interactions = yzer.import_file(
        yzer.get_filename(
            data_dirpath,
            'transcript_pairs_enhancer_with_anything_with_me2_inc_me2_counts.txt'
        ))
    interactions = interactions[interactions['count'] > 1]
    all_transcripts = yzer.import_file(
        yzer.get_filename(data_dirpath, 'transcript_vectors.txt'))

    for me2_timepoint in ('6h', '24h'):
        me2_col = 'me2_{0}_ratio'.format(me2_timepoint)
        kla_col = 'kla_lfc'
        col_set = [me2_col + '_2', kla_col + '_2', kla_col, me2_col]

        interactions[me2_col] = numpy.log2(nonzero(interactions['me2_kla_{0}_tag_count'.format(me2_timepoint)])/\
                                            nonzero(interactions['me2_notx_tag_count']))
        interactions[me2_col + '_2'] = numpy.log2(nonzero(interactions['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])/\
                                            nonzero(interactions['me2_notx_tag_count_2']))

        transcripts = all_transcripts[['id', kla_col]]

        # Associate gene id
        interactions = interactions.merge(transcripts, how='left', on='id')

        transcripts['id_2'] = transcripts['id']
        transcripts = transcripts.drop(['id'], axis=1)
        interactions = interactions.merge(transcripts,
                                          how='left',
                                          on='id_2',
                                          suffixes=['', '_2'])
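        # After these two merges each interaction row carries the KLA LFC for
        # both ends of the pair: `kla_lfc` keyed on `id` and `kla_lfc_2` keyed
        # on `id_2`.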
Example #6
    th2 = yzer.import_file(
        yzer.get_filename(dirpath,
                          'th2_with_th1_{0}.txt'.format(peak))).fillna(0)

    # Filter out promoters
    th1 = th1[th1['tss_id'] == 0]
    th2 = th2[th2['tss_id'] == 0]

    # Get venn-diagram sets
    only_th1 = th1[th1['p2_id'] == 0]
    only_th2 = th2[th2['p2_id'] == 0]
    shared = th1[th1['p2_id'] != 0]
    shared_check = th2[th2['p2_id'] != 0]
    print len(only_th1), len(only_th2), len(shared), len(shared_check)

    only_th1['th1_tag_count'] = nonzero(only_th1['tag_count'])
    only_th1['th2_tag_count'] = nonzero(only_th1['p2_tag_count'])
    shared['th1_tag_count'] = nonzero(shared['tag_count'])
    shared['th2_tag_count'] = nonzero(shared['p2_tag_count'])
    only_th2['th1_tag_count'] = nonzero(only_th2['p2_tag_count'])
    only_th2['th2_tag_count'] = nonzero(only_th2['tag_count'])
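    # Align columns before stacking: for rows from the Th1 file the Th1 signal
    # is in 'tag_count' and the Th2 signal in 'p2_tag_count', and vice versa
    # for rows from the Th2 file, so all three subsets share 'th1_tag_count'
    # and 'th2_tag_count'.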

    data = shared.append(only_th1, ignore_index=True)
    data = data.append(only_th2, ignore_index=True)

    if False:
        # Scatterplots of tag counts
        ax = yzer.scatterplot(
            data,
            'th1_tag_count',
            'th2_tag_count',
Example #7
    interactions = yzer.import_file(
        yzer.get_filename(
            data_dirpath,
            'transcript_pairs_enhancer_with_anything_with_me2_inc_me2_counts.txt'
        ))
    interactions = interactions[interactions['count'] > 1]

    interactions = interactions.fillna(0)

    # Key on peak id, not enhancer id, which could be bidirectional
    #interactions['id_2'] = interactions['h3k4me2_id']
    interactions['hash'] = interactions.apply(
        lambda row: '{0}.{1}'.format(row['id'], row['id_2']), axis=1)
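    if False:
        # An equivalent vectorized key, avoiding the row-wise apply above
        # (assumes 'id' and 'id_2' are plain scalar columns):
        interactions['hash'] = (interactions['id'].astype(str) + '.' +
                                interactions['id_2'].astype(str))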

    for me2_timepoint in ('6h', '24h'):
        interactions['me2_ratio'] = nonzero(interactions['me2_kla_{0}_tag_count'.format(me2_timepoint)])/\
                                            nonzero(interactions['me2_notx_tag_count'])
        interactions['me2_ratio_2'] = nonzero(interactions['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])/\
                                            nonzero(interactions['me2_notx_tag_count_2'])

        col = 'me2_ratio'
        pairs = {}
        pairs['notx'] = interactions[interactions['sequencing_run_id'] == 765]
        pairs['kla_30m'] = interactions[interactions['sequencing_run_id'] ==
                                        766]
        pairs['kla_4h'] = interactions[interactions['sequencing_run_id'] ==
                                       773]

        # Enhancer is totally new interactor in 4h KLA
        pairs['only_notx'] = pairs['notx'][
            (~pairs['notx']['id_2'].isin(pairs['kla_30m']['id_2']))
Example #8
    dirpath = yzer.get_path(dirpath)
    img_dirpath = yzer.get_and_create_path(dirpath, 'figures')

    data = yzer.import_file(
        yzer.get_filename(dirpath, 'ctcf_with_stat1_binding.txt')).fillna(0)
    with_stat1 = data[data['p2_tag_count'] > 0]
    without_stat1 = data[data['p2_tag_count'] == 0]
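    # 'p2_tag_count' holds the overlapping STAT1 peak's tags (see the axis
    # labels below), so this splits CTCF sites by presence or absence of any
    # STAT1 signal.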

    if True:
        ax = yzer.piechart(
            [len(with_stat1), len(without_stat1)],
            ['CTCF sites with STAT1', 'CTCF sites without STAT1'],
            title='DP Thymocyte CTCF Sites with STAT1 in Th1 Cells',
            save_dir=img_dirpath,
            show_plot=True)
    data['tag_count_nonzero'] = nonzero(data['tag_count'])
    data['p2_tag_count_nonzero'] = nonzero(data['p2_tag_count'])
    ax = yzer.scatterplot(
        data,
        'tag_count_nonzero',
        'p2_tag_count_nonzero',
        xlabel='CTCF Tag Count',
        ylabel='Stat1 Tag Count',
        log=True,
        color='blue',
        title='Tags in CTCF Peaks versus Overlapping Stat1 Peaks',
        show_2x_range=False,
        show_legend=False,
        show_count=True,
        show_correlation=True,
        save_dir=img_dirpath,
Example #9
        # Output so that we don't have to recompute that every time.
        enhancers.to_csv(yzer.get_filename(
            data_dirpath, 'all_enhancers_with_me2_and_interaction_stats.txt'),
                         sep='\t',
                         header=True,
                         index=False)

    enhancers = yzer.import_file(
        yzer.get_filename(
            data_dirpath,
            'all_enhancers_with_me2_and_{0}interaction_stats.txt'.format(
                tss_only and 'tss_' or '')))

    col = 'me2_ratio'
    for me2_timepoint in ('6h', '24h'):
        enhancers[col] = nonzero(enhancers['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])/\
                                    nonzero(enhancers['me2_notx_tag_count_2'])

        sets = OrderedDict()
        sets['10x me2 in KLA {0}'.format(me2_timepoint)] = enhancers[
            enhancers[col] > 10]
        sets['No change me2 in KLA {0}'.format(me2_timepoint)] = enhancers[
            (enhancers[col] >= .5) & (enhancers[col] <= 2)]
        sets['1/10 me2 in KLA {0}'.format(me2_timepoint)] = enhancers[
            enhancers[col] < .1]

        labels = [
            l + '\n(count: {0})'.format(len(v))
            for l, v in zip(sets.keys(), sets.values())
        ]
Example #10
    dp = yzer.import_file(
        yzer.get_filename(dirpath, 'dp_with_thiomac_ctcf.txt')).fillna(0)
    thio = yzer.import_file(
        yzer.get_filename(dirpath, 'thiomac_with_dp_ctcf.txt')).fillna(0)

    # Get venn-diagram sets
    only_dp = dp[dp['thiomac_ctcf_tag_count'] == 0]
    only_thio = thio[thio['dp_ctcf_tag_count'] == 0]
    shared = dp[dp['thiomac_ctcf_tag_count'] != 0]
    shared_check = thio[thio['dp_ctcf_tag_count'] != 0]
    print len(only_dp), len(only_thio), len(shared), len(shared_check)

    data = shared.append(only_dp, ignore_index=True)
    data = data.append(only_thio, ignore_index=True)

    data['dp_nonzero'] = nonzero(data['dp_ctcf_tag_count'])
    data['thio_nonzero'] = nonzero(data['thiomac_ctcf_tag_count'])
    ax = yzer.scatterplot(
        data,
        'dp_nonzero',
        'thio_nonzero',
        xlabel='DP Thymocyte CTCF Tag Count',
        ylabel='ThioMac CTCF Tag Count',
        log=True,
        color='blue',
        title='Tags in CTCF Peaks in DP Thymocytes versus ThioMacs',
        show_2x_range=False,
        show_legend=False,
        show_count=True,
        show_correlation=True,
        save_dir=img_dirpath,
Example #11
    dirpath = yzer.get_path(dirpath)
    img_dirpath = yzer.get_and_create_path(dirpath, 'figures')

    peak = 'p300'
    th1 = yzer.import_file(
        yzer.get_filename(dirpath,
                          'th1_with_th2_{0}.txt'.format(peak))).fillna(0)
    th2 = yzer.import_file(
        yzer.get_filename(dirpath,
                          'th2_with_th1_{0}.txt'.format(peak))).fillna(0)

    # Filter out promoters
    th1 = th1[th1['tss_id'] == 0]
    th2 = th2[th2['tss_id'] == 0]

    th1['th1_tag_count'] = nonzero(th1['tag_count'])
    th1['th2_tag_count'] = nonzero(th1['p2_tag_count'])
    th2['th1_tag_count'] = nonzero(th2['tag_count'])
    th2['th2_tag_count'] = nonzero(th2['p2_tag_count'])

    with_ctcf = th1[th1['ctcf_tag_count'] > 0]
    without_ctcf = th1[th1['ctcf_tag_count'] == 0]

    datasets = [with_ctcf, without_ctcf, th1, th1[th1['p2_tag_count'] > 0]]

    vals = [d['th2_tag_count'] / d['th1_tag_count'] for d in datasets]
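    # Each entry of `vals` is the per-peak Th2/Th1 tag-count ratio for one
    # subset, presumably destined for a distribution (box) plot comparing the
    # groups.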

    base_labels = [
        'With p300 and CTCF', 'With p300 but not CTCF', 'All in Th1',
        'All Shared', 'All in Th2'
    ]
Example #12
    th1_with_ctcf_motif = yzer.import_file(
        yzer.get_filename(dirpath, 'motifs', 'th_p300_enhancers_ctcf',
                          'th1_only_{0}_with_ctcf_motif.txt'.format(peak)))
    th2_with_ctcf_motif = yzer.import_file(
        yzer.get_filename(dirpath, 'motifs', 'th_p300_enhancers_ctcf',
                          'th2_only_{0}_with_ctcf_motif.txt'.format(peak)))
    shared_with_ctcf_motif = yzer.import_file(
        yzer.get_filename(dirpath, 'motifs', 'th_p300_enhancers_ctcf',
                          'th_shared_{0}_with_ctcf_motif.txt'.format(peak)))

    # Filter out promoters
    th1 = th1[th1['tss_id'] == 0]
    th2 = th2[th2['tss_id'] == 0]
    th1_with_ctcf_motif['id'] = th1_with_ctcf_motif['PositionID']
    th2_with_ctcf_motif['id'] = th2_with_ctcf_motif['PositionID']
    shared_with_ctcf_motif['id'] = shared_with_ctcf_motif['PositionID']

    th1['th1_tag_count'] = nonzero(th1['tag_count'])
    th1['th2_tag_count'] = nonzero(th1['p2_tag_count'])
    th2['th1_tag_count'] = nonzero(th2['tag_count'])
    th2['th2_tag_count'] = nonzero(th2['p2_tag_count'])

    with_ctcf = th1[th1['id'].isin(th1_with_ctcf_motif['id'])
                    | th1['id'].isin(shared_with_ctcf_motif['id'])]
    without_ctcf = th1[~th1['id'].isin(th1_with_ctcf_motif['id'])
                       & ~th1['id'].isin(shared_with_ctcf_motif['id'])]
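    # Membership in the HOMER motif files (keyed by PositionID above) decides
    # which Th1 p300 peaks count as carrying a CTCF motif, either in the
    # Th1-only or the shared set.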

    datasets = [with_ctcf, without_ctcf, th1,
                th1[th1['p2_tag_count'] > 0]]  #, th2]

    vals = [d['th2_tag_count'] / d['th1_tag_count'] for d in datasets]

    labels = [
Example #13
    
    dirpath = 'karmel/Desktop/Projects/GlassLab/Notes_and_Reports/NOD_BALBc/ThioMacs/Analysis_2013_02/'
    dirpath = yzer.get_path(dirpath)
    img_dirpath = yzer.get_and_create_path(dirpath, 'refseq_expression')
    data = yzer.import_file(yzer.get_filename(dirpath, 'transcript_vectors.txt'))
    
    data = data.fillna(0)
    data = yzer.normalize(data, 'nod_notx_1h_tag_count', 1.095436)
    data = yzer.normalize(data, 'nod_kla_1h_tag_count', 0.652898)
    #data = yzer.normalize(data, 'nonplated_diabetic_nod_notx_tag_count', 0.885427)
    #data = yzer.normalize(data, 'nonplated_diabetic_balb_notx_tag_count', 0.645579)
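    # `normalize` presumably rescales each NOD column by the given factor so
    # the libraries are comparable, writing the result to a matching
    # '<column>_norm' column (those '_norm' columns are used below).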
    
    data['balb_notx_1h_reads_per_base'] = data['balb_notx_1h_tag_count']/data['length']
    data['balb_kla_1h_reads_per_base'] = data['balb_kla_1h_tag_count']/data['length']
    
    data['balb_notx_1h_tag_count'] = nonzero(data['balb_notx_1h_tag_count'])
    data['nod_notx_1h_tag_count_norm'] = nonzero(data['nod_notx_1h_tag_count_norm'])
    data['balb_kla_1h_tag_count'] = nonzero(data['balb_kla_1h_tag_count'])
    data['nod_kla_1h_tag_count_norm'] = nonzero(data['nod_kla_1h_tag_count_norm'])
    
    data = data[data['transcript_score'] >= 4]
    data = data[data[['balb_notx_1h_tag_count','nod_notx_1h_tag_count_norm',
                      'balb_kla_1h_tag_count','nod_kla_1h_tag_count_norm']].max(axis=1) >= 10]
    
    refseq = yzer.get_refseq(data)
    
    # Remove low tag counts
    refseq = refseq[refseq['transcript_score'] >= 4]
    

Example #14
    if False:
        data = data[data['minimal_distance'] >= 1000]

    transcripts = yzer.import_file(
        yzer.get_filename(dirpath, 'transcript_vectors.txt'))
    transcripts['nearest_refseq_transcript_id'] = transcripts['id']
    data = data.merge(transcripts,
                      how='left',
                      on='nearest_refseq_transcript_id',
                      suffixes=['', '_trans'])

    data = data.fillna(0)

    total_tags = total_tags_per_run()
    data['dmso_1_rpkm'] = data['dmso_1_tag_count_trans'] * (
        10**3 * 10**6) / data['length_trans'] / total_tags['dmso'][1]
    data['h4k8ac_kla_ratio'] = nonzero(data['h4k8ac_kla_tag_count']) / nonzero(
        data['h4k8ac_tag_count'])
    data['h4k8ac_kla_dex_ratio'] = nonzero(
        data['h4k8ac_kla_dex_tag_count']) / nonzero(
            data['h4k8ac_kla_tag_count'])

    for subgroup, suffix, dataset in (('RefSeq Transcripts', '_trans',
                                       data.groupby(
                                           by='nearest_refseq_transcript_id',
                                           as_index=False).mean()), ):
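        # The grouping collapses the rows to one mean row per nearest RefSeq
        # transcript before plotting.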

        ax = yzer.scatterplot(
            dataset[(dataset['kla_1_lfc_trans'] >= 1)],
            'dmso_1_rpkm',
            'dex_over_kla_1_lfc_trans',
            log=True,