data = data[data.filter(like='h3k4me2').max(axis=1) > min_tags] data = data[data['minimal_distance'] >= 1000] transcripts = yzer.import_file( yzer.get_filename(dirpath, 'transcript_vectors.txt')) transcripts['nearest_refseq_transcript_id'] = transcripts['id'] data = data.merge(transcripts, how='left', on='nearest_refseq_transcript_id', suffixes=['', '_trans']) data = data.fillna(0) total_tags = total_tags_per_run() data['h4k8ac_kla_dex_ratio'] = nonzero( data['h4k8ac_kla_dex_tag_count']) / nonzero( data['h4k8ac_kla_tag_count']) data['dmso_1_rpkm'] = data['dmso_1_tag_count_trans'] * ( 10**3 * 10**6) / data['length_trans'] / total_tags['dmso'][1] if False: # Low basal expression only data = data[data['dmso_1_rpkm'] > 2] img_dirpath = yzer.get_and_create_path( dirpath, 'boxplots_by_expression_high_basal', consistent and 'consistent' or 'rep1') if False: # Lose acetyl only data = data[data['h4k8ac_kla_dex_ratio'] < .75] img_dirpath = yzer.get_and_create_path( dirpath, 'boxplots_by_expression_lose_ac',
data = data[data['kla_1_lfc'] >= 1] data = data[data['gr_kla_dex_tag_count'] > 0] data = data[data['gr_fa_kla_dex_tag_count'] == 0] print len(data) if pu_1: data = data[data['pu_1_kla_tag_count'] + data['pu_1_kla_tag_count'] > 0] data = data.fillna(0) data = ucsc_link_cleanup(data) colname = 'dex_over_kla_1_lfc' if pu_1: colname = 'pu_1_ratio' data[colname] = numpy.log2(nonzero(data['pu_1_kla_dex_tag_count'])/nonzero(data['pu_1_kla_tag_count'])) none = (data['{0}_kla_nearby_tag_count'.format(peak_type)] + data['{0}_kla_dex_nearby_tag_count'.format(peak_type)] == 0) kla_gt = (data['{0}_kla_tag_count'.format(peak_type)] > ratio*data['{0}_kla_dex_nearby_tag_count'.format(peak_type)]) kla_dex_gt = (data['{0}_kla_dex_tag_count'.format(peak_type)] > ratio*data['{0}_kla_nearby_tag_count'.format(peak_type)]) nc = (data['{0}_kla_nearby_tag_count'.format(peak_type)] + data['{0}_kla_dex_nearby_tag_count'.format(peak_type)] > 0) \ & (data['{0}_kla_dex_tag_count'.format(peak_type)] < ratio*data['{0}_kla_nearby_tag_count'.format(peak_type)]) \ & (data['{0}_kla_tag_count'.format(peak_type)] < ratio*data['{0}_kla_dex_nearby_tag_count'.format(peak_type)]) data['{0}_kla_to_kla_dex_ratio'.format(peak_type)] = data['{0}_kla_tag_count'.format(peak_type)]\ /data['{0}_kla_dex_nearby_tag_count'.format(peak_type)] print sum(kla_gt) print sum(kla_dex_gt) data[kla_gt].to_csv(yzer.get_filename(img_dirpath, 'enhancer_like_lose_{0}_{1}x_change_dsg_only.txt'.format( peak_type, ratio)),
''' from __future__ import division from glasslab.dataanalysis.graphing.seq_grapher import SeqGrapher from glasslab.utils.functions import nonzero if __name__ == '__main__': yzer = SeqGrapher() dirpath = 'karmel/Desktop/Projects/GlassLab/Notes_and_Reports/Glass Atlas/Demo-data' dirpath = yzer.get_path(dirpath) img_dirpath = yzer.get_and_create_path(dirpath, 'refseq_to_homer/large_gap_500bp') data = yzer.import_file( yzer.get_filename(dirpath, 'refseq_tag_counts_500bp.txt')) data['sum'] = nonzero(data['sum'].fillna(0)) homer_data = yzer.import_file( yzer.get_filename(dirpath, 'RNA_GroSeq_CountsGenes.txt')) homer_data['sequence_identifier'] = homer_data['Gene ID'] homer_data['homer_tag_count'] = nonzero(homer_data[ 'ThioMac-GroSeq-notx-110513/ genes (Total: 12166480.0) normFactor 0.82'] .fillna(0)) homer_data = homer_data[['sequence_identifier', 'homer_tag_count']] merged = data.merge(homer_data, how='inner', on='sequence_identifier') merged = merged.fillna(1) if True: ax = yzer.scatterplot(merged, xcolname='homer_tag_count',
# Column used below to split enhancers into up / unchanged / down subsets.
# Presumably a log2 fold change (the "> 2" subset is labelled "4x") -- confirm.
kla_col = 'kla_lfc'
tss_only = False

# Sub-folder name depends on whether only TSS interactions are analysed.
interaction_scope = 'genic' if tss_only else 'all_interactions'
img_dirpath = yzer.get_and_create_path(
    dirpath, 'interactions_by_kla_lfc', interaction_scope, 'lfc_2')

# File generated in novel_me2_sites
file_prefix = 'tss_' if tss_only else ''
enhancers = yzer.import_file(
    yzer.get_filename(
        data_dirpath,
        'all_enhancers_with_me2_and_{0}interaction_stats.txt'.format(
            file_prefix)))

for kla_timepoint in ('1h', ):
    # me2 tag ratio, KLA 6h over notx. nonzero() wraps both operands
    # (presumably to keep the division away from zero -- confirm).
    enhancers['me2_ratio'] = (
        nonzero(enhancers['me2_kla_6h_tag_count_2'])
        / nonzero(enhancers['me2_notx_tag_count_2']))

    # OrderedDict so the subsets are iterated in the order they are added.
    sets = OrderedDict()
    sets['4x GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
        enhancers[kla_col] > 2]
    sets['No change GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
        enhancers[kla_col].abs() <= 1]
    sets['1/4 GRO in KLA {0}'.format(kla_timepoint)] = enhancers[
        enhancers[kla_col] < -2]

    # One label per subset, annotated with the number of rows it holds.
    labels = [
        name + '\n(count: {0})'.format(len(subset))
        for name, subset in sets.items()
    ]
interactions = yzer.import_file(
    yzer.get_filename(
        data_dirpath,
        'transcript_pairs_enhancer_with_anything_with_me2_inc_me2_counts.txt'
    ))
# Keep only rows whose 'count' is greater than 1.
interactions = interactions[interactions['count'] > 1]

all_transcripts = yzer.import_file(
    yzer.get_filename(data_dirpath, 'transcript_vectors.txt'))

kla_col = 'kla_lfc'

# Attach kla_lfc for both ids of every row. Nothing here depends on the me2
# timepoint, so it is done once, ahead of the loop: repeating the same merges
# for every timepoint would collide with the kla_lfc columns added on the
# previous pass, and pandas would then rename them with _x/_y suffixes.
# .copy() gives a frame of our own, so adding id_2 below does not write to a
# selection taken from all_transcripts.
transcripts = all_transcripts[['id', kla_col]].copy()
# Associate gene id
interactions = interactions.merge(transcripts, how='left', on='id')
# Same lookup keyed on id_2; the added column is named kla_lfc_2.
transcripts['id_2'] = transcripts['id']
transcripts = transcripts.drop(['id'], axis=1)
interactions = interactions.merge(transcripts, how='left', on='id_2',
                                  suffixes=['', '_2'])

for me2_timepoint in ('6h', '24h'):
    me2_col = 'me2_{0}_ratio'.format(me2_timepoint)
    col_set = [me2_col + '_2', kla_col + '_2', kla_col, me2_col]

    # log2 of the me2 tag ratio (KLA timepoint over notx), for the first
    # and for the "_2" set of tag-count columns.
    interactions[me2_col] = numpy.log2(
        nonzero(interactions['me2_kla_{0}_tag_count'.format(me2_timepoint)])
        / nonzero(interactions['me2_notx_tag_count']))
    interactions[me2_col + '_2'] = numpy.log2(
        nonzero(interactions['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])
        / nonzero(interactions['me2_notx_tag_count_2']))
th2 = yzer.import_file( yzer.get_filename(dirpath, 'th2_with_th1_{0}.txt'.format(peak))).fillna(0) # Filter out promoters th1 = th1[th1['tss_id'] == 0] th2 = th2[th2['tss_id'] == 0] # Get venn-diagram sets only_th1 = th1[th1['p2_id'] == 0] only_th2 = th2[th2['p2_id'] == 0] shared = th1[th1['p2_id'] != 0] shared_check = th2[th2['p2_id'] != 0] print len(only_th1), len(only_th2), len(shared), len(shared_check) only_th1['th1_tag_count'] = nonzero(only_th1['tag_count']) only_th1['th2_tag_count'] = nonzero(only_th1['p2_tag_count']) shared['th1_tag_count'] = nonzero(shared['tag_count']) shared['th2_tag_count'] = nonzero(shared['p2_tag_count']) only_th2['th1_tag_count'] = nonzero(only_th2['p2_tag_count']) only_th2['th2_tag_count'] = nonzero(only_th2['tag_count']) data = shared.append(only_th1, ignore_index=True) data = data.append(only_th2, ignore_index=True) if False: # Scatterplots of tag counts ax = yzer.scatterplot( data, 'th1_tag_count', 'th2_tag_count',
interactions = yzer.import_file( yzer.get_filename( data_dirpath, 'transcript_pairs_enhancer_with_anything_with_me2_inc_me2_counts.txt' )) interactions = interactions[interactions['count'] > 1] interactions = interactions.fillna(0) # Key on peak id, not enhancer id, which could be bidirectional #interactions['id_2'] = interactions['h3k4me2_id'] interactions['hash'] = interactions.apply( lambda row: '{0}.{1}'.format(row['id'], row['id_2']), axis=1) for me2_timepoint in ('6h', '24h'): interactions['me2_ratio'] = nonzero(interactions['me2_kla_{0}_tag_count'.format(me2_timepoint)])/\ nonzero(interactions['me2_notx_tag_count']) interactions['me2_ratio_2'] = nonzero(interactions['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])/\ nonzero(interactions['me2_notx_tag_count_2']) col = 'me2_ratio' pairs = {} pairs['notx'] = interactions[interactions['sequencing_run_id'] == 765] pairs['kla_30m'] = interactions[interactions['sequencing_run_id'] == 766] pairs['kla_4h'] = interactions[interactions['sequencing_run_id'] == 773] # Enhancer is totally new interactor in 4h KLA pairs['only_notx'] = pairs['notx'][ (~pairs['notx']['id_2'].isin(pairs['kla_30m']['id_2']))
dirpath = yzer.get_path(dirpath) img_dirpath = yzer.get_and_create_path(dirpath, 'figures') data = yzer.import_file( yzer.get_filename(dirpath, 'ctcf_with_stat1_binding.txt')).fillna(0) with_stat1 = data[data['p2_tag_count'] > 0] without_stat1 = data[data['p2_tag_count'] == 0] if True: ax = yzer.piechart( [len(with_stat1), len(without_stat1)], ['CTCF sites with STAT1', 'CTCF sites without STAT1'], title='DP Thymocyte CTCF Sites with STAT1 in Th1 Cells', save_dir=img_dirpath, show_plot=True) data['tag_count_nonzero'] = nonzero(data['tag_count']) data['p2_tag_count_nonzero'] = nonzero(data['p2_tag_count']) ax = yzer.scatterplot( data, 'tag_count_nonzero', 'p2_tag_count_nonzero', xlabel='CTCF Tag Count', ylabel='Stat1 Tag Count', log=True, color='blue', title='Tags in CTCF Peaks versus Overlapping Stat1 Peaks', show_2x_range=False, show_legend=False, show_count=True, show_correlation=True, save_dir=img_dirpath,
# Output so that we don't have to recompute that every time.
# NOTE(review): this always writes the un-prefixed file name, while the
# read-back below honours tss_only -- confirm that is intended.
enhancers.to_csv(yzer.get_filename(
    data_dirpath, 'all_enhancers_with_me2_and_interaction_stats.txt'),
    sep='\t', header=True, index=False)

file_prefix = 'tss_' if tss_only else ''
enhancers = yzer.import_file(
    yzer.get_filename(
        data_dirpath,
        'all_enhancers_with_me2_and_{0}interaction_stats.txt'.format(
            file_prefix)))

col = 'me2_ratio'
# Fold threshold for the "gained" / "lost" subsets. The labels are built from
# it so that they always describe the rows that were actually selected.
fold = 10
for me2_timepoint in ('6h', '24h'):
    # me2 tag ratio, KLA timepoint over notx, recomputed for each timepoint.
    enhancers[col] = (
        nonzero(enhancers['me2_kla_{0}_tag_count_2'.format(me2_timepoint)])
        / nonzero(enhancers['me2_notx_tag_count_2']))

    # OrderedDict so the subsets are iterated in the order they are added.
    sets = OrderedDict()
    sets['{0}x me2 in KLA {1}'.format(fold, me2_timepoint)] = enhancers[
        enhancers[col] > fold]
    sets['No change me2 in KLA {0}'.format(me2_timepoint)] = enhancers[
        (enhancers[col] >= .5) & (enhancers[col] <= 2)]
    sets['1/{0} me2 in KLA {1}'.format(fold, me2_timepoint)] = enhancers[
        enhancers[col] < 1.0 / fold]

    # One label per subset, annotated with the number of rows it holds.
    labels = [
        name + '\n(count: {0})'.format(len(subset))
        for name, subset in sets.items()
    ]
dp = yzer.import_file( yzer.get_filename(dirpath, 'dp_with_thiomac_ctcf.txt')).fillna(0) thio = yzer.import_file( yzer.get_filename(dirpath, 'thiomac_with_dp_ctcf.txt')).fillna(0) # Get venn-diagram sets only_dp = dp[dp['thiomac_ctcf_tag_count'] == 0] only_thio = thio[thio['dp_ctcf_tag_count'] == 0] shared = dp[dp['thiomac_ctcf_tag_count'] != 0] shared_check = thio[thio['dp_ctcf_tag_count'] != 0] print len(only_dp), len(only_thio), len(shared), len(shared_check) data = shared.append(only_dp, ignore_index=True) data = data.append(only_thio, ignore_index=True) data['dp_nonzero'] = nonzero(data['dp_ctcf_tag_count']) data['thio_nonzero'] = nonzero(data['thiomac_ctcf_tag_count']) ax = yzer.scatterplot( data, 'dp_nonzero', 'thio_nonzero', xlabel='DP Thymocyte CTCF Tag Count', ylabel='ThioMac CTCF Tag Count', log=True, color='blue', title='Tags in CTCF Peaks in DP Thymocytes versus ThioMacs', show_2x_range=False, show_legend=False, show_count=True, show_correlation=True, save_dir=img_dirpath,
dirpath = yzer.get_path(dirpath)
img_dirpath = yzer.get_and_create_path(dirpath, 'figures')

peak = 'p300'
# One table per cell type; missing values are replaced with 0.
th1 = yzer.import_file(
    yzer.get_filename(dirpath,
                      'th1_with_th2_{0}.txt'.format(peak))).fillna(0)
th2 = yzer.import_file(
    yzer.get_filename(dirpath,
                      'th2_with_th1_{0}.txt'.format(peak))).fillna(0)

# Filter out promoters
th1 = th1[th1['tss_id'] == 0]
th2 = th2[th2['tss_id'] == 0]

# NOTE(review): assumes that in each table 'tag_count' belongs to the table's
# own cell type and 'p2_tag_count' to the other one, as the file names
# (th1_with_th2 / th2_with_th1) suggest -- confirm against the exports.
# Under that reading the Th2 table must take its Th1 counts from
# 'p2_tag_count' and its Th2 counts from 'tag_count'.
th1['th1_tag_count'] = nonzero(th1['tag_count'])
th1['th2_tag_count'] = nonzero(th1['p2_tag_count'])
th2['th1_tag_count'] = nonzero(th2['p2_tag_count'])
th2['th2_tag_count'] = nonzero(th2['tag_count'])

# Split the Th1 rows by whether any CTCF tags were counted.
with_ctcf = th1[th1['ctcf_tag_count'] > 0]
without_ctcf = th1[th1['ctcf_tag_count'] == 0]

# The last dataset keeps only Th1 rows with a non-zero 'p2_tag_count'.
datasets = [with_ctcf, without_ctcf, th1, th1[th1['p2_tag_count'] > 0]]
# Th2-over-Th1 tag ratio for every dataset.
vals = [d['th2_tag_count'] / d['th1_tag_count'] for d in datasets]

# NOTE(review): five labels for four datasets -- 'All in Th2' has no matching
# entry in `datasets` here.
base_labels = [
    'With p300 and CTCF', 'With p300 but not CTCF', 'All in Th1',
    'All Shared', 'All in Th2'
]
'th1_only_{0}_with_ctcf_motif.txt'.format(peak))) th2_with_ctcf_motif = yzer.import_file( yzer.get_filename(dirpath, 'motifs', 'th_p300_enhancers_ctcf', 'th2_only_{0}_with_ctcf_motif.txt'.format(peak))) shared_with_ctcf_motif = yzer.import_file( yzer.get_filename(dirpath, 'motifs', 'th_p300_enhancers_ctcf', 'th_shared_{0}_with_ctcf_motif.txt'.format(peak))) # Filter out promoters th1 = th1[th1['tss_id'] == 0] th2 = th2[th2['tss_id'] == 0] th1_with_ctcf_motif['id'] = th1_with_ctcf_motif['PositionID'] th2_with_ctcf_motif['id'] = th2_with_ctcf_motif['PositionID'] shared_with_ctcf_motif['id'] = shared_with_ctcf_motif['PositionID'] th1['th1_tag_count'] = nonzero(th1['tag_count']) th1['th2_tag_count'] = nonzero(th1['p2_tag_count']) th2['th1_tag_count'] = nonzero(th2['tag_count']) th2['th2_tag_count'] = nonzero(th2['p2_tag_count']) with_ctcf = th1[th1['id'].isin(th1_with_ctcf_motif['id']) | th1['id'].isin(shared_with_ctcf_motif['id'])] without_ctcf = th1[~th1['id'].isin(th1_with_ctcf_motif['id']) & ~th1['id'].isin(shared_with_ctcf_motif['id'])] datasets = [with_ctcf, without_ctcf, th1, th1[th1['p2_tag_count'] > 0]] #, th2] vals = [d['th2_tag_count'] / d['th1_tag_count'] for d in datasets] labels = [
dirpath = 'karmel/Desktop/Projects/GlassLab/Notes_and_Reports/NOD_BALBc/ThioMacs/Analysis_2013_02/' dirpath = yzer.get_path(dirpath) img_dirpath = yzer.get_and_create_path(dirpath, 'refseq_expression') data = yzer.import_file(yzer.get_filename(dirpath, 'transcript_vectors.txt')) data = data.fillna(0) data = yzer.normalize(data, 'nod_notx_1h_tag_count', 1.095436) data = yzer.normalize(data, 'nod_kla_1h_tag_count', 0.652898) #data = yzer.normalize(data, 'nonplated_diabetic_nod_notx_tag_count', 0.885427) #data = yzer.normalize(data, 'nonplated_diabetic_balb_notx_tag_count', 0.645579) data['balb_notx_1h_reads_per_base'] = data['balb_notx_1h_tag_count']/data['length'] data['balb_kla_1h_reads_per_base'] = data['balb_kla_1h_tag_count']/data['length'] data['balb_notx_1h_tag_count'] = nonzero(data['balb_notx_1h_tag_count']) data['nod_notx_1h_tag_count_norm'] = nonzero(data['nod_notx_1h_tag_count_norm']) data['balb_kla_1h_tag_count'] = nonzero(data['balb_kla_1h_tag_count']) data['nod_kla_1h_tag_count_norm'] = nonzero(data['nod_kla_1h_tag_count_norm']) data = data[data['transcript_score'] >= 4] data = data[data[['balb_notx_1h_tag_count','nod_notx_1h_tag_count_norm', 'balb_kla_1h_tag_count','nod_kla_1h_tag_count_norm']].max(axis=1) >= 10] refseq = yzer.get_refseq(data) # Remove low tag counts refseq = refseq[refseq['transcript_score'] >= 4] if False:
data = data[data['minimal_distance'] >= 1000] transcripts = yzer.import_file( yzer.get_filename(dirpath, 'transcript_vectors.txt')) transcripts['nearest_refseq_transcript_id'] = transcripts['id'] data = data.merge(transcripts, how='left', on='nearest_refseq_transcript_id', suffixes=['', '_trans']) data = data.fillna(0) total_tags = total_tags_per_run() data['dmso_1_rpkm'] = data['dmso_1_tag_count_trans'] * ( 10**3 * 10**6) / data['length_trans'] / total_tags['dmso'][1] data['h4k8ac_kla_ratio'] = nonzero(data['h4k8ac_kla_tag_count']) / nonzero( data['h4k8ac_tag_count']) data['h4k8ac_kla_dex_ratio'] = nonzero( data['h4k8ac_kla_dex_tag_count']) / nonzero( data['h4k8ac_kla_tag_count']) for subgroup, suffix, dataset in (('RefSeq Transcripts', '_trans', data.groupby( by='nearest_refseq_transcript_id', as_index=False).mean()), ): ax = yzer.scatterplot( dataset[(dataset['kla_1_lfc_trans'] >= 1)], 'dmso_1_rpkm', 'dex_over_kla_1_lfc_trans', log=True,