コード例 #1
0
    def load_data(self):
        """Load the saved TF-analysis tables and harmonise motif TF names.

        Reads the peak and motif CSV files found under
        ``<OUTPUT_DIR>/tf_analysis``, stores each table on the instance,
        rewrites selected ``tf`` labels of ``self.all_motifs`` in place and
        finally calls ``self.summarize_tfs()``.
        """
        tf_dir = '%s/tf_analysis' % OUTPUT_DIR

        peaks = pd.read_csv('%s/all_peaks.csv' % tf_dir)
        self.all_peaks = peaks.set_index('name')

        self.linked_peaks = read_orfs_data('%s/linked_peaks.csv' % tf_dir,
                                           'name')
        self.linked_peaks_normalized = read_orfs_data(
            '%s/linked_peaks_norm.csv' % tf_dir, 'name')

        # These two files carry their index in the unnamed first CSV column.
        prom_peaks = pd.read_csv('%s/prom_peaks.csv' % tf_dir)
        self.prom_peaks = prom_peaks.set_index('Unnamed: 0')

        motifs = pd.read_csv('%s/all_motifs.csv' % tf_dir)
        motifs = motifs.set_index('Unnamed: 0')
        self.all_motifs = motifs

        # Rename TF motifs so that they match gene names; the frame stored on
        # the instance is modified in place.
        tf_aliases = (('RCS1', 'AFT1'), ('YML081W', 'TDA9'))
        for motif_name, gene_name in tf_aliases:
            motifs.loc[motifs.tf == motif_name, 'tf'] = gene_name

        self.summarize_tfs()
コード例 #2
0
def load_p123(strand_name):
    """Load the p1/p2/p3 position tables for a strand and their differences.

    Reads ``p1_<strand>.csv``, ``p2_<strand>.csv`` and ``p3_<strand>.csv``
    from ``config.mnase_dir``, keeps only the ORFs that ``validate_pair``
    accepts for both the (+1, +2) and the (+2, +3) pairing, and applies
    ``difference`` to each restricted table.

    Parameters
    ----------
    strand_name : str
        Suffix used in the CSV file names (callers in this code base pass
        ``'sense'`` or ``'antisense'``).

    Returns
    -------
    tuple
        ``(p1, p2, p3, p1_shift, p2_shift, p3_shift)``; all six tables share
        the same rows, sorted by ORF identifier.
    """
    from config import mnase_dir
    from src.datasets import read_orfs_data
    from src.transformations import difference

    p1_positions = read_orfs_data('%s/p1_%s.csv' % (mnase_dir, strand_name))
    p2_positions = read_orfs_data('%s/p2_%s.csv' % (mnase_dir, strand_name))
    p3_positions = read_orfs_data('%s/p3_%s.csv' % (mnase_dir, strand_name))

    p12 = p1_positions.join(p2_positions, lsuffix='_+1', rsuffix='_+2')
    p23 = p2_positions.join(p3_positions, lsuffix='_+2', rsuffix='_+3')

    valid_12_orfs = validate_pair(p12, '+1', '+2')
    valid_23_orfs = validate_pair(p23, '+2', '+3')

    # Sort the intersection: plain set iteration order depends on string
    # hash randomisation, which made the row order differ between runs.
    valid_orfs = sorted(set(valid_12_orfs) & set(valid_23_orfs))

    p1_shift = difference(p1_positions.loc[valid_orfs])
    p2_shift = difference(p2_positions.loc[valid_orfs])
    p3_shift = difference(p3_positions.loc[valid_orfs])

    p1 = p1_positions.loc[valid_orfs]
    p2 = p2_positions.loc[valid_orfs]
    p3 = p3_positions.loc[valid_orfs]

    return p1, p2, p3, p1_shift, p2_shift, p3_shift
コード例 #3
0
    def plot_antisense(self, antisense=None):
        """Plot antisense TPM per cluster as quartile bars with median dots.

        For every cluster (numbered from 1) and every time point a vertical
        bar spanning the 25%-75% quantiles is drawn, with a diamond marker at
        the median. Bars of one cluster are laid out side by side around the
        cluster's integer x position.

        Parameters
        ----------
        antisense : DataFrame, optional
            Antisense TPM table with the time points as columns. When omitted
            it is read from ``<rna_dir>/antisense_TPM.csv``.
        """
        apply_global_settings(titlepad=45)

        cluster_data = self.hc.clustered_data
        from src.datasets import read_orfs_data

        if antisense is None:
            antisense = read_orfs_data('%s/antisense_TPM.csv' % rna_dir)

        data = antisense.loc[cluster_data.index].join(
            cluster_data[['cluster']])

        fig, ax = plt.subplots(figsize=(7, 4))
        fig.tight_layout(rect=[0.05, 0.1, 0.95, 0.8])

        times = [0.0, 7.5, 15, 30, 60, 120]
        spacing = 0.13
        num_clusters = len(data.cluster.unique())

        for cluster in range(1, num_clusters + 1):
            members = data[data.cluster == cluster][times]

            for slot, time in enumerate(times):
                values = members[time].values
                q75 = np.quantile(values, 0.75)
                q25 = np.quantile(values, 0.25)
                median = np.median(values)

                # offset each time point so the six bars straddle the tick
                x = cluster + spacing * slot - spacing * 2.5
                ax.plot([x, x], [q75, q25],
                        linewidth=3.,
                        color='#FF5C5C',
                        alpha=1,
                        solid_capstyle='butt')
                ax.scatter(x,
                           median,
                           s=6,
                           marker='D',
                           color='black',
                           zorder=10)

        ax.set_xticks(np.arange(num_clusters + 1))
        ax.set_xlim(0.5, num_clusters + 0.5)
        # ax.set_yticks(np.arange(0, 40, 10))

        ax.tick_params(axis='x', length=0, pad=10, labelsize=16)
        ax.tick_params(axis='y', labelsize=16)

        ax.set_ylabel('Transcripts per million', fontsize=18)
        ax.set_xlabel('Cluster', fontsize=18)
        ax.set_title('Antisense transcripts per cluster', fontsize=23)

        # light separators half-way between neighbouring clusters
        for boundary in np.arange(1, num_clusters):
            ax.axvline(boundary + 0.5, color='#d0d0d0', linewidth=1)
コード例 #4
0
def plot_xrate_vs_TPM(datastore, half_lifes=None):
    """Scatter the 120' transcription-rate change against the TPM change.

    Points are coloured by the ``half_life`` column of
    ``data/half_life.csv`` and the Pearson correlation of the two plotted
    quantities is written into the figure title.

    Parameters
    ----------
    datastore : object
        Must expose ``transcript_rate_logfold`` and ``sense_TPM_logfold``
        tables that have a ``120`` column.
    half_lifes : sequence of two numbers, optional
        ``(low, high)``; when given, only rows with
        ``low < half_life < high`` are plotted and both bounds are marked on
        the colour bar.
    """
    half_lives = read_orfs_data('data/half_life.csv')

    plot_data = datastore.transcript_rate_logfold[[120]]
    plot_data = plot_data.join(half_lives[['half_life']])
    plot_data = plot_data.sort_values('half_life')

    plot_data.loc[plot_data.index, 'TPM'] = \
        datastore.sense_TPM_logfold.loc[plot_data.index][120]

    if half_lifes is not None:
        plot_data = plot_data[(plot_data.half_life > half_lifes[0])
                              & (plot_data.half_life < half_lifes[1])]

    cor, pval = pearsonr(plot_data[120.0], plot_data.TPM)
    pval = convert_to_latex_sci_not(pval)

    title = 'Log$_2$ fold-change in transcription rate vs\n' \
            'log$_2$ fold-change in transcript level, 0-120\''
    title = ("%s\nPearson's r=%.2f, p=%s" % (title, cor, pval))

    fig, ax = plt.subplots(1, 1, figsize=(7, 6))
    fig.tight_layout(rect=[0.1, 0.1, 0.9, 0.85])

    # Hollow grey outline underneath every point. The original passed
    # c='' for this, which current matplotlib rejects as an invalid colour;
    # facecolors='none' is the supported way to draw unfilled markers.
    ax.scatter(plot_data[120.0],
               plot_data.TPM,
               facecolors='none',
               edgecolors='#c0c0c0',
               s=5,
               rasterized=True)
    scatter = ax.scatter(plot_data[120.0],
                         plot_data.TPM,
                         c=plot_data['half_life'],
                         s=3,
                         cmap='magma_r',
                         vmin=0,
                         vmax=100,
                         rasterized=True)
    ax.set_xlabel('Log$_2$ fold-change transcription rate')
    ax.set_ylabel('Log$_2$ fold-change transcript level, TPM')
    plt.suptitle(title, fontsize=16)
    cbar = plt.colorbar(scatter)
    cbar.ax.set_title("Half life, min")

    if half_lifes is not None:
        # NOTE(review): hl / 100. assumes the colour-bar axes run from 0 to 1
        # (vmax is 100) - confirm for the matplotlib version in use.
        for hl in half_lifes:
            cbar.ax.plot([0, 1.], [hl / 100., hl / 100.],
                         color='white',
                         linestyle='dashed',
                         zorder=100)
コード例 #5
0
def plot_coverage(misc_figures_dir):
    """Save a histogram of MNase-seq coverage values above 0.8.

    Coverage is read from ``<mnase_dir>/coverage_2000.csv``; a red vertical
    line is drawn at 0.85 and the figure is written to
    ``<misc_figures_dir>/coverage.pdf``.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    fig.tight_layout(rect=[0.1, 0.15, 0.95, 0.8])

    coverage_table = read_orfs_data('%s/coverage_2000.csv' % mnase_dir)
    well_covered = coverage_table[coverage_table.coverage > 0.8]

    ax.hist(well_covered['coverage'], edgecolor='white', bins=25)
    ax.set_title('MNase-seq coverage\n[-1000, 1000] around TSS', fontsize=18)
    ax.axvline(x=0.85, color='red')
    ax.set_xlim(0.8, 1.0)
    ax.set_xlabel("Coverage", fontsize=15)
    ax.set_ylabel("# of genes", fontsize=15)

    output_path = "%s/coverage.pdf" % misc_figures_dir
    plt.savefig(output_path, transparent=True)
コード例 #6
0
    def plot_half_lifes(self):
        """Plot the quartile range and median half life of every cluster.

        Half lives come from the ``half_life`` column of
        ``data/half_life.csv`` and are grouped by the ``cluster`` column of
        ``self.hc.clustered_data``. Clusters are expected to be numbered
        consecutively from 1; their count is taken from the data (as
        ``plot_antisense`` does) instead of being fixed at seven.
        """
        apply_global_settings(titlepad=20)

        cluster_data = self.hc.clustered_data
        from src.datasets import read_orfs_data
        half_lifes = read_orfs_data('data/half_life.csv')[['half_life']]

        data = half_lifes.loc[cluster_data.index]
        data = data.join(cluster_data[['cluster']])

        fig, ax = plt.subplots(figsize=(7, 5))
        fig.tight_layout(rect=[0.05, 0.1, 0.95, 0.8])

        num_clusters = len(data.cluster.unique())
        spacing = 0.13

        for c in range(1, num_clusters + 1):
            cur = data[data.cluster == c].half_life
            q75 = np.quantile(cur, 0.75)
            q25 = np.quantile(cur, 0.25)
            median = np.median(cur)

            # same horizontal offset formula as the per-time-point plots,
            # evaluated for slot 3
            x = c + spacing * 3 - spacing * 2.5
            ax.plot([x, x], [q75, q25],
                    linewidth=6.,
                    color='#abd1fc',
                    alpha=1,
                    solid_capstyle='butt')
            ax.scatter(x, median, s=16, marker='D', color='black', zorder=10)

        ax.set_xticks(np.arange(num_clusters + 1))
        ax.set_xlim(0.5, num_clusters + 0.5)
        # ax.set_yticks(np.arange(0, 200, 50))
        ax.set_ylim(0, 50)

        ax.tick_params(axis='x', length=0, pad=10, labelsize=16)
        ax.tick_params(axis='y', labelsize=16)

        ax.set_ylabel('Half life, min', fontsize=18)
        ax.set_xlabel('Cluster', fontsize=18)
        ax.set_title('Half lifes per cluster', fontsize=30)

        # separators to the right of every cluster (the last one coincides
        # with the right-hand axis limit)
        for x in np.arange(1, num_clusters + 1):
            ax.axvline(x + 0.5, color='#d0d0d0', linewidth=1)
コード例 #7
0
def plot_antisense_lengths():
    """Draw a histogram of antisense transcript lengths.

    Lengths are ``stop - start`` for every row of
    ``<rna_dir>/antisense_boundaries_computed.csv``; rows where the
    difference is missing are dropped. The number of remaining transcripts
    is shown in the title.
    """
    boundaries = read_orfs_data(
        '%s/antisense_boundaries_computed.csv' % rna_dir)

    from src.plot_utils import apply_global_settings
    apply_global_settings()

    fig, ax = plt.subplots(figsize=(4.5, 3))
    fig.tight_layout(rect=[0.05, 0.05, 0.95, 0.9])

    lengths = boundaries.stop - boundaries.start
    lengths = lengths.dropna()

    ax.hist(lengths, bins=25, linewidth=1, edgecolor='white')

    title = "Antisense transcript lengths, N=%d" % len(lengths)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel("Length (bp)")
    ax.set_ylabel("# of genes")
コード例 #8
0
ファイル: gp.py プロジェクト: HarteminkLab/cadmium-paper
    def load_results(self, path):
        """Load saved predictions and recompute the per-time R^2 scores.

        The file at ``path`` must contain ``<time>_predicted`` and
        ``<time>_true`` columns (time formatted with one decimal) for every
        entry of ``self.times``. Sets ``self.Y`` (true values),
        ``self.Y_predict`` (predictions) and ``self.r2`` (one ``r2`` value
        per time point).
        """
        results = read_orfs_data(path).copy()

        # Column-less frames that keep the row index of the results table;
        # one column per time point is added below.
        observed = results[[]].copy()
        fitted = results[[]].copy()

        scores = pd.DataFrame(index=self.times)
        scores['r2'] = 0.0

        for time in self.times:
            fitted_values = results['%.1f_predicted' % (time)]
            observed_values = results['%.1f_true' % (time)]

            fitted.loc[:, time] = fitted_values
            observed.loc[:, time] = observed_values

            scores.loc[time, 'r2'] = r2_score(observed_values, fitted_values)

        self.Y = observed
        self.Y_predict = fitted
        self.r2 = scores
コード例 #9
0
    def plot_antisense_heatmap(self, orf_names=None):
        """Show ``log2(TPM + 1)`` of antisense transcripts as a heatmap.

        Rows follow ``self.hc.clustered_data`` sorted by ``cluster``. When
        ``orf_names`` is given, only those rows are shown, in the order
        supplied. Axis ticks are hidden.
        """
        ordered = self.hc.clustered_data.sort_values('cluster')

        tpm = read_orfs_data('%s/antisense_TPM.csv' % rna_dir)
        tpm = tpm.loc[ordered.index]
        if orf_names is not None:
            tpm = tpm.loc[orf_names]

        plt.figure(figsize=(4, 10))
        log_tpm = np.log2(tpm + 1)

        # the aspect ratio shrinks with the number of rows
        plt.imshow(log_tpm,
                   aspect=40. / len(log_tpm),
                   vmin=0,
                   vmax=12,
                   cmap='viridis')
        plt.yticks([])
        plt.xticks([])
コード例 #10
0
def create_suppl_data():
    """Write the supplemental chromatin and expression tables to CSV.

    Takes the raw promoter small-fragment occupancy and raw gene-body
    organization tables from a fresh ``ChromatinDataStore`` plus the sense
    TPM table, relabels their columns with ``times_str`` and writes all
    three into ``mnase_dir``.
    """
    from src.chromatin_metrics_data import ChromatinDataStore

    # gather the three tables
    store = ChromatinDataStore()
    occupancy = store.promoter_sm_occupancy_raw
    disorganization = store.gene_body_organization_raw
    expression = read_orfs_data('%s/sense_TPM.csv' % rna_dir)

    # use the shared time labels as column headers
    for table in (occupancy, disorganization, expression):
        table.columns = times_str

    # NOTE(review): unlike the two files below, this one is written without
    # a float_format - confirm that is intended.
    occupancy_path = ('%s/small_fragment_promoter_occupancy_all_times.csv'
                      % mnase_dir)
    occupancy.to_csv(occupancy_path)

    # four decimal places for the remaining tables
    disorganization_path = (
        '%s/gene_body_nucleosome_disorganization_entropy_all_times.csv'
        % mnase_dir)
    disorganization.to_csv(disorganization_path, float_format='%.4f')

    expression_path = '%s/gene_expression_TPM_all_times.csv' % mnase_dir
    expression.to_csv(expression_path, float_format='%.4f')
コード例 #11
0
    def __init__(self, is_antisense=False, output_dir=None):
        """Load every chromatin and transcription table used downstream.

        Parameters
        ----------
        is_antisense : bool
            When True the 'antisense' occupancy/entropy files are read, the
            ORF set is restricted to rows of the antisense boundary table
            without missing values, and ``antisense_TPM_logfold`` is set.
            Otherwise the 'sense' files are read and ``sense_TPM_logfold``
            is set instead.
        output_dir : str, optional
            Root directory holding the ``rna_seq`` and ``mnase_seq``
            sub-directories; defaults to the module-level ``OUTPUT_DIR``.
        """

        self.is_antisense = is_antisense

        if is_antisense: strand_name = 'antisense'
        else: strand_name = 'sense'

        if output_dir is None:
            out_dir = OUTPUT_DIR
        else:
            out_dir = output_dir

        # these locals shadow any module-level names of the same spelling
        rna_dir = '%s/rna_seq' % out_dir
        mnase_dir = '%s/mnase_seq' % out_dir

        orfs = read_orfs_data('%s/orfs_cd_paper_dataset.csv' % out_dir)
        orfs_idx = orfs.index.values

        antisense_path = '%s/antisense_boundaries_computed.csv' % rna_dir
        antisense_TSS = read_orfs_data(antisense_path)

        if is_antisense:
            # antisense mode: keep only ORFs with a complete boundary row
            orfs_idx = antisense_TSS.dropna().index.values
            antisense_TPM_logfold = read_orfs_data(
                '%s/antisense_TPM_log2fold.csv' % rna_dir)
            self.antisense_TPM_logfold = antisense_TPM_logfold.loc[orfs_idx]
        else:
            sense_TPM_logfold = read_orfs_data('%s/sense_TPM_log2fold.csv' %
                                               rna_dir)
            self.sense_TPM_logfold = sense_TPM_logfold.loc[orfs_idx]

        xrate = read_orfs_data('%s/orf_xrates.csv' % rna_dir)
        xrate_logfold = read_orfs_data('%s/orf_xrates_log2fold.csv' % rna_dir)

        # occupancy is kept in long form, indexed by (orf_name, time)
        path = '%s/occupancies_%s.csv' % (mnase_dir, strand_name)
        occupancy = pd.read_csv(path)\
            .set_index(['orf_name', 'time'])
        self.occupancy = occupancy

        from src.entropy import load_orf_entropies
        from src.nucleosome_calling import load_p123

        # NOTE(review): load_p123 only receives the strand name, not the
        # mnase_dir derived from output_dir above - confirm it reads from the
        # same directory.
        (self.p1, self.p2, self.p3, self.p1_shift, self.p2_shift,
         self.p3_shift) = load_p123(strand_name)

        # sense TPM is loaded in both modes and is not restricted to orfs_idx
        TPM = read_orfs_data('%s/sense_TPM.csv' % rna_dir)
        self.sense_TPM = TPM
        self.sense_log2_TPM = np.log2(TPM + 1)

        self.N = len(orfs_idx)

        # promoter occupancy (scale by length of 'promoter')
        self.promoter_sm_occupancy_raw = pivot_metric(occupancy.loc[orfs_idx],
                                                      '-200_0_len_0_100')
        self.promoter_sm_occupancy = pivot_metric(occupancy.loc[orfs_idx],
                                                  '-200_0_len_0_100') / 200.
        self.promoter_sm_occupancy = normalize_by_time(
            self.promoter_sm_occupancy)

        # promoter nucleosome occupancy (scale by length of 'promoter')
        self.promoter_nuc_occupancy_raw = pivot_metric(occupancy.loc[orfs_idx],
                                                       '-200_0_len_144_174')
        self.promoter_nuc_occupancy = self.promoter_nuc_occupancy_raw / 200.
        self.promoter_nuc_occupancy = normalize_by_time(
            self.promoter_nuc_occupancy)

        # gene body nucleosome occupancy (scale by length of 'gene body')
        # NOTE(review): the metric key is '0_500...' but the divisor is 200. -
        # confirm the intended scale.
        self.gene_body_nuc_occupancy_raw = pivot_metric(
            occupancy.loc[orfs_idx], '0_500_len_144_174')
        self.gene_body_nuc_occupancy = self.gene_body_nuc_occupancy_raw / 200.
        self.gene_body_nuc_occupancy = normalize_by_time(
            self.gene_body_nuc_occupancy)

        # gene body organization
        gene_body_organization = load_orf_entropies('0_150',
                                                    'triple',
                                                    strand_name,
                                                    mnase_seq_dir=mnase_dir)
        self.gene_body_organization = gene_body_organization.copy(
        ).loc[orfs_idx]
        # the raw attribute keeps the un-normalized table; the name below is
        # then rebound to the result of normalize_by_time
        self.gene_body_organization_raw = self.gene_body_organization
        self.gene_body_organization = normalize_by_time(
            self.gene_body_organization)

        # scale by length of 'promoter'
        promoter_organization = load_orf_entropies('-200_0',
                                                   'triple',
                                                   strand_name,
                                                   mnase_seq_dir=mnase_dir)
        self.promoter_organization = promoter_organization.loc[orfs_idx]
        self.promoter_organization = normalize_by_time(
            self.promoter_organization)

        self.transcript_rate = xrate.copy().loc[orfs_idx]
        self.transcript_rate_logfold = xrate_logfold.loc[orfs_idx]

        # `difference` is applied to each normalized table to obtain deltas
        self.promoter_sm_occupancy_delta = \
            difference(self.promoter_sm_occupancy)
        self.gene_body_disorganization_delta = \
            difference(self.gene_body_organization)
        self.promoter_disorganization_delta = \
            difference(self.promoter_organization)

        # other deltas
        self.promoter_nuc_occ_delta = \
            difference(self.promoter_nuc_occupancy)
        self.gene_body_nuc_occ_delta = \
            difference(self.gene_body_nuc_occupancy)

        self.orfs = orfs

        # promoter and gene-body deltas side by side; overlapping column
        # names are disambiguated with the '_promoter' / '_gene' suffixes
        self.chromatin_data = self.promoter_sm_occupancy_delta.join(
            self.gene_body_disorganization_delta,
            lsuffix='_promoter',
            rsuffix='_gene')

        # the inner join keeps only ORFs present in both tables
        self.data = self.chromatin_data.join(self.transcript_rate_logfold,
                                             how='inner')
        self.xlabels = [
            'Small fragment\noccupancy', 'Nucleosome\ndisorganization',
            'Transcription\nrate'
        ]

        self.sort_data()
コード例 #12
0
# ---------- Analysis ---------------

# Directory for 'gp' outputs under OUTPUT_DIR (note the trailing slash).
gp_dir = '%s/gp/' % OUTPUT_DIR

# Per-strand p1/p2/p3 CSV files inside mnase_dir (sense strand).
p1_sense_path = '%s/p1_sense.csv' % mnase_dir
p2_sense_path = '%s/p2_sense.csv' % mnase_dir
p3_sense_path = '%s/p3_sense.csv' % mnase_dir

# Same three files for the antisense strand.
p1_antisense_path = '%s/p1_antisense.csv' % mnase_dir
p2_antisense_path = '%s/p2_antisense.csv' % mnase_dir
p3_antisense_path = '%s/p3_antisense.csv' % mnase_dir

# --------- Global data -----------------

from src.datasets import read_orfs_data
import os

# Paper ORF table; stays None when the CSV has not been generated yet.
paper_orfs_path = "%s/orfs_cd_paper_dataset.csv" % OUTPUT_DIR
paper_orfs = (read_orfs_data(paper_orfs_path)
              if os.path.exists(paper_orfs_path) else None)

# Antisense boundary table, loaded under the same condition.
antisense_orfs_path = '%s/antisense_boundaries_computed.csv' % rna_dir
antisense_orfs = (read_orfs_data(antisense_orfs_path)
                  if os.path.exists(antisense_orfs_path) else None)

コード例 #13
0
def antisense_plots():
    """Render and save all antisense figures into ``<OUTPUT_DIR>/antisense``.

    Produces the sense-vs-antisense distribution, the bar counts, the
    antisense log fold-change distribution, the antisense length histogram,
    the MET31 calling example and two chromatin-vs-transcription
    distributions built from an antisense ``ChromatinDataStore``.

    Relies on a module-level ``datastore`` for the sense transcription rate
    log fold-changes.
    """

    from src.antisense_analysis import plot_antisense_vs_sense
    from src.antisense_analysis import plot_bar_counts, plot_antisense_dist

    save_dir = '%s/antisense' % OUTPUT_DIR
    mkdirs_safe([save_dir])

    # The plain antisense TPM table and the antisense boundary table were
    # previously also read here but never used; those loads are dropped.
    antisense_TPM_logfold = read_orfs_data('%s/antisense_TPM_log2fold.csv' %
                                           rna_dir)

    plot_antisense_vs_sense(
        antisense_TPM_logfold,
        datastore.transcript_rate_logfold,
        120.0,
        highlight=['MET31', 'CKB1', 'RPS7A', 'YBR241C', 'UTR2'])
    plt.savefig('%s/sense_antisense_distr.pdf' % save_dir,
                transparent=True,
                dpi=100)

    plot_bar_counts(antisense_TPM_logfold, datastore.transcript_rate_logfold)
    plt.savefig('%s/sense_antisense_counts.pdf' % save_dir)

    plot_antisense_dist(antisense_TPM_logfold)
    plt.savefig('%s/antisense_logfc_dist.pdf' % save_dir)

    from src.antisense_analysis import plot_antisense_lengths, plot_antisense_calling

    rna_seq_pileup = pd.read_hdf('%s/rna_seq_pileup.h5.z' % rna_dir, 'pileup')

    # plot_antisense_lengths takes no arguments and loads its own data
    plot_antisense_lengths()
    plt.savefig('%s/antisense_lengths_dist.pdf' % save_dir)

    plot_antisense_calling('MET31', rna_seq_pileup)
    plt.savefig('%s/antisense_met31_calling.pdf' % save_dir)

    from src.chromatin_summary_plots import plot_distribution

    anti_datastore = ChromatinDataStore(is_antisense=True)

    # mean promoter occupancy change vs mean antisense transcript change
    x = anti_datastore.promoter_sm_occupancy_delta.mean(axis=1)
    y = anti_datastore.antisense_TPM_logfold.mean(axis=1).loc[x.index]
    plot_distribution(
        x,
        y,
        '$\\Delta$ Antisense promoter occupancy',
        'Log$_2$ fold-change antisense transcript',
        highlight=[],
        title='Promoter occupancy vs transcription (Antisense)',
        xlim=(-1.5, 1.5),
        ylim=(-4, 4),
        xstep=0.5,
        ystep=1)
    plt.savefig('%s/antisense_chrom_dist_prom_vs_xrate.pdf' % save_dir)

    # mean disorganization change (missing rows dropped) vs the same y
    x = anti_datastore.gene_body_disorganization_delta.mean(axis=1).dropna()
    y = anti_datastore.antisense_TPM_logfold.loc[x.index].mean(
        axis=1).loc[x.index]

    plot_distribution(
        x,
        y,
        '$\\Delta$ antisense nucleosome disorganization',
        'Log$_2$ fold-change antisense transcripts',
        highlight=[],
        title='Nuc. disorganization vs transcription (Antisense)',
        xlim=(-1.5, 1.5),
        ylim=(-4, 4),
        xstep=0.5,
        ystep=1)
    plt.savefig('%s/antisense_chrom_dist_disorg_vs_xrate.pdf' % save_dir)
コード例 #14
0
from src.utils import print_fl
from src.datasets import read_orfs_data
from src.timer import Timer
from src.reference_data import (read_park_TSS_PAS, read_brogaard_nucleosomes,
                                read_macisaac_abf1_sites, read_sgd_orfs)

# global timer
timer = Timer()

# global inputs
# (start as None; read_input_data below assigns rna_seq)
rna_seq = None
all_mnase_data = None
mnase_coverage = None

# Reference tables are loaded at import time.
all_orfs = read_sgd_orfs()
half_lives = read_orfs_data('data/half_life.csv')
TSSs = read_park_TSS_PAS()
# ORF table extended with the TSS and PAS columns
orfs = all_orfs.join(TSSs[['TSS', 'PAS']])

# In debug mode only ORFs on the chromosomes in DEBUG_CHROMS are kept.
if DEBUG:
    orfs = orfs[orfs.chr.isin(DEBUG_CHROMS)]


def read_input_data():
    """Load the RNA-seq table into the module-level ``rna_seq`` global.

    NOTE(review): ``all_mnase_data`` is declared global but is not assigned
    in the lines visible here - the function may continue beyond this
    excerpt.
    """

    global rna_seq
    global all_mnase_data

    print_fl("Reading RNA-seq...", end='')
    # reads the 'rna_seq_data' key of the HDF file at rna_seq_path
    rna_seq = pd.read_hdf(rna_seq_path, 'rna_seq_data')
    print_fl("Done.")
コード例 #15
0
def plot_antisense_calling(gene_name, rna_seq_pileup):
    """Illustrate antisense transcript calling around a single gene.

    The top axis shows ORF annotations, the bottom axis the smoothed
    ``log2(pileup + 1)`` of the '+' strand (drawn upwards) and of the '-'
    strand (drawn downwards) within 1000 bp of the gene's transcript
    boundaries. If the gene has an entry in the computed antisense boundary
    table, its start and stop are marked with red vertical bars on the side
    of the strand opposite to the gene.

    Parameters
    ----------
    gene_name : str
        Gene passed to ``get_orf`` for lookup in the Park boundary table.
    rna_seq_pileup : DataFrame
        Pileup table; after filtering it must provide ``strand``,
        ``position`` and ``pileup``.
    """

    from src.rna_seq_plotter import get_smoothing_kernel
    from src.plot_utils import apply_global_settings
    from src.utils import get_orf
    from src.transcription import filter_rna_seq_pileup
    from src.transcript_boundaries import load_park_boundaries
    from src.plot_orf_annotations import ORFAnnotationPlotter
    from config import paper_orfs
    from src.reference_data import read_sgd_orfs, read_park_TSS_PAS
    from src.datasets import read_orfs_data

    all_orfs = read_sgd_orfs()
    all_orfs = all_orfs.join(read_park_TSS_PAS()[['TSS', 'PAS']])

    orfs_plotter = ORFAnnotationPlotter(orfs=all_orfs)

    antisense_boundaries = read_orfs_data('%s/antisense_boundaries_computed.csv' % rna_dir)

    park_boundaries = load_park_boundaries()
    park_boundaries = park_boundaries.join(paper_orfs[['name']])

    gene = get_orf(gene_name, park_boundaries)

    # plotting window: transcript extended by `search_2` bp on both sides
    search_2 = 1000
    plot_span = (gene.transcript_start - search_2,
                 gene.transcript_stop + search_2)
    gene_pileup = filter_rna_seq_pileup(rna_seq_pileup,
                                        plot_span[0], plot_span[1], gene.chr)

    apply_global_settings(30)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 5.))
    fig.tight_layout(rect=[0.1, 0, 1, 0.85])

    orfs_plotter.set_span_chrom(plot_span, gene.chr)
    orfs_plotter.plot_orf_annotations(ax1)

    # Curves are split by genomic strand ('+' above, '-' below the axis),
    # regardless of which strand the gene is on. Select the pileup column
    # before summing so non-numeric columns never enter the aggregation.
    plus_pileup = gene_pileup[gene_pileup.strand == '+']
    minus_pileup = gene_pileup[gene_pileup.strand == '-']
    plus_data = np.log2(plus_pileup.groupby('position').pileup.sum() + 1)
    minus_data = np.log2(minus_pileup.groupby('position').pileup.sum() + 1)

    smooth_kernel = get_smoothing_kernel(100, 20)

    # Each strand keeps its own positions: the two strands need not cover
    # the same set of positions, and plotting one against the other's index
    # fails when the lengths differ.
    plus_x = plus_data.index
    minus_x = minus_data.index
    plus_smooth = np.convolve(plus_data, smooth_kernel, mode='same')
    minus_smooth = np.convolve(minus_data, smooth_kernel, mode='same')

    ax2.plot(plus_x, plus_smooth, color=plt.get_cmap('Blues')(0.5))
    ax2.plot(minus_x, -minus_smooth, color=plt.get_cmap('Reds')(0.5))
    ax2.set_xlim(*plot_span)
    ax2.set_ylim(-15, 15)
    ax2.axhline(0, color='black')

    if gene.name in antisense_boundaries.index:
        anti_gene = antisense_boundaries.loc[gene.name]

        # Parenthesised so the conditional picks one whole (low, high) pair.
        # The unparenthesised form built a 3-tuple and only produced the
        # same bars by accident.
        y_plot = (0, 20) if gene.strand == '-' else (-20, 0)

        ax2.plot([anti_gene.start, anti_gene.start],
                 [y_plot[0], y_plot[1]], color='red', linewidth=2.5,
                 solid_capstyle='butt')
        ax2.plot([anti_gene.stop, anti_gene.stop],
                 [y_plot[0], y_plot[1]], color='red', linewidth=2.5,
                 solid_capstyle='butt')

    ax2.set_xticks(np.arange(plot_span[0], plot_span[1], 500))
    ax2.set_xticklabels([])
    _ = ax2.set_xticks(np.arange(plot_span[0], plot_span[1], 100), minor=True)

    ax2.tick_params(labelsize=14)
    ax2.set_ylabel("Sum log$_2$ (pileup+1)", fontsize=15)
    ax2.set_xlabel("Position (bp)", fontsize=15)

    ax1.set_title("Calling antisense transcripts", fontsize=26)

    # NOTE(review): fixed genomic coordinates, independent of gene_name -
    # presumably markers for one specific example gene; confirm they should
    # be drawn for every call.
    ax2.axvline(383344)
    ax2.axvline(384114)
コード例 #16
0
ファイル: gp.py プロジェクト: HarteminkLab/cadmium-paper
    def design_matrix(self,
                      incl_times=None,
                      incl_prom=True,
                      incl_gene=True,
                      incl_occ=True,
                      incl_cc=True,
                      incl_small=True,
                      incl_nuc=True,
                      incl_sense=True,
                      incl_antisense=True,
                      incl_shift=False,
                      predict_TPM=True,
                      include_TPM_0=True,
                      scale=True,
                      logfold=True):
        """Build the covariate matrix ``X`` and outcome table ``Y``.

        Parameters
        ----------
        incl_times : list of numbers, optional
            Time points forwarded to ``load_design_matrix``; defaults to
            ``[0, 7.5, 15, 30, 60, 120]``.
        incl_prom, incl_gene, incl_occ, incl_cc, incl_small, incl_nuc : bool
            Feature-group switches forwarded to ``load_design_matrix``.
        incl_sense, incl_antisense : bool
            Whether to join the sense and/or antisense covariates.
        incl_shift : bool
            Forwarded for the sense covariates only; the antisense call
            always passes False.
        predict_TPM : bool
            True: ``Y`` is ``log2(sense TPM + 0.1)``. False: ``Y`` is the
            transcription-rate log2 fold-change table.
        include_TPM_0 : bool
            Add ``log2`` of the time-0 TPM as covariate ``'0.0_TPM'``.
            Forced off when ``self.name == 'Intercept'``.
        scale : bool
            Standardize the covariates with ``preprocessing.scale``.
        logfold : bool
            Apply ``log2(x + 0.1)`` to columns whose name contains ``_cc_``
            or ``_occ_``.

        Returns
        -------
        (X, Y)
            Also stored as ``self.X`` and ``self.Y``. ``X`` always ends with
            a constant ``intercept`` column.
        """
        # a fresh list per call instead of a shared mutable default
        if incl_times is None:
            incl_times = [0, 7.5, 15, 30, 60, 120]

        if self.name == 'Intercept': include_TPM_0 = False

        orfs = paper_orfs

        # TODO: Testing to see how GPR performs on good set of genes @ 120' only
        # more complex subsetting if we want to subset different genes per
        # each time point
        if SUBSET_GPR_GENES:
            path = '%s/good_p1_nucs_gene_set_120.csv' % mnase_dir
            subset_idx = read_orfs_data(path).index.values
            orfs = orfs.loc[subset_idx]

            print_fl("Subsetting to well-positioned +1 nucleosomes, N=%d" %
                     len(orfs))

        orfs_idx = orfs.index.values

        # start from a column-less frame that carries only the ORF index
        X = orfs[[]].copy()

        if incl_sense:
            sense_X = self.load_design_matrix(incl_times=incl_times,
                                              incl_prom=incl_prom,
                                              incl_gene=incl_gene,
                                              incl_occ=incl_occ,
                                              incl_cc=incl_cc,
                                              incl_small=incl_small,
                                              incl_nuc=incl_nuc,
                                              incl_shift=incl_shift)
            X = X.join(sense_X)

        if incl_antisense:
            antisense_X = self.load_design_matrix(
                incl_times=incl_times,
                incl_prom=incl_prom,
                incl_gene=incl_gene,
                incl_occ=incl_occ,
                incl_cc=incl_cc,
                incl_small=incl_small,
                incl_nuc=incl_nuc,
                incl_shift=False,  # no antisense shift data
                antisense=True)
            X = X.join(antisense_X, lsuffix='_sense', rsuffix='_antisense')

        # The TPM table is needed for the outcome (predict_TPM) and for the
        # time-0 covariate (include_TPM_0). Loading it whenever either is
        # requested fixes the unbound-variable error that occurred with
        # predict_TPM=False and include_TPM_0=True.
        TPM = None
        if predict_TPM or include_TPM_0:
            TPM = read_orfs_data('%s/sense_TPM.csv' % rna_dir).loc[orfs_idx]

        # predict absolute TPM level (log2)
        if predict_TPM:
            Y = np.log2(TPM + 0.1)

        # predict log2 fold change
        else:
            xrate_logfold = read_orfs_data('%s/orf_xrates_log2fold.csv' %
                                           rna_dir)
            Y = xrate_logfold.loc[orfs_idx]

        # add TPM at time 0
        if include_TPM_0:
            X['0.0_TPM'] = np.log2(TPM[0].copy())

        if self.sample_N is not None:
            # fixed seed so the same subsample is drawn on every call
            np.random.seed(123)
            orfs_idx = X.index
            orfs_idx = np.random.choice(orfs_idx, self.sample_N, replace=False)

            X = X.loc[orfs_idx]
            Y = Y.loc[orfs_idx]

        # TODO: replace infinite values with 0
        # (e.g. log2 of a zero TPM above yields -inf)
        X = X.replace([np.inf, -np.inf], 0.0)

        if len(X.columns) > 0:

            # logfold covariates
            if logfold:
                columns = X.columns
                # log transform cross correlation and occupancy columns
                logfold_cols = columns[columns.str.contains('_cc_')
                                       | columns.str.contains('_occ_')]
                X.loc[:, logfold_cols] = np.log2(X[logfold_cols] + 0.1)

            # scale covariates
            if scale:
                X.loc[:] = preprocessing.scale(X)

        # the intercept is added after scaling so it stays exactly 1
        X['intercept'] = 1

        self.X, self.Y = X, Y

        return X, Y