def load_data(self):
    """Load the saved TF-analysis tables and align motif TF names.

    Reads the peak and motif CSV files found in ``<OUTPUT_DIR>/tf_analysis``,
    stores them on the instance, rewrites two values of the motif ``tf``
    column so they match gene names, and then calls ``self.summarize_tfs()``.
    """
    save_dir = '%s/tf_analysis' % OUTPUT_DIR

    peaks_frame = pd.read_csv('%s/all_peaks.csv' % save_dir)
    self.all_peaks = peaks_frame.set_index('name')

    self.linked_peaks = read_orfs_data(
        '%s/linked_peaks.csv' % save_dir, 'name')
    self.linked_peaks_normalized = read_orfs_data(
        '%s/linked_peaks_norm.csv' % save_dir, 'name')

    prom_frame = pd.read_csv('%s/prom_peaks.csv' % save_dir)
    self.prom_peaks = prom_frame.set_index('Unnamed: 0')

    motifs_frame = pd.read_csv('%s/all_motifs.csv' % save_dir)
    self.all_motifs = motifs_frame.set_index('Unnamed: 0')

    # rename tf motifs to match gene names (edits self.all_motifs in place)
    motifs = self.all_motifs
    gene_name_by_motif = {
        'RCS1': 'AFT1',
        'YML081W': 'TDA9'
    }
    for motif_name, gene_name in gene_name_by_motif.items():
        motifs.loc[motifs.tf == motif_name, 'tf'] = gene_name

    self.summarize_tfs()
def load_p123(strand_name):
    """Load the p1/p2/p3 position tables and their time differences.

    Only ORFs that pass ``validate_pair`` for both the (+1, +2) and the
    (+2, +3) pairing are kept.

    Args:
        strand_name: suffix used in the ``p<N>_<strand_name>.csv`` file
            names under ``mnase_dir``.

    Returns:
        Tuple ``(p1, p2, p3, p1_shift, p2_shift, p3_shift)`` where the first
        three are the position tables restricted to the valid ORFs and the
        last three are ``difference`` applied to those same selections.
    """
    from config import mnase_dir
    from src.datasets import read_orfs_data
    from src.transformations import difference

    nuc_numbers = (1, 2, 3)

    # one table per nucleosome, read in p1, p2, p3 order
    positions = {}
    for nuc in nuc_numbers:
        positions[nuc] = read_orfs_data(
            '%s/p%d_%s.csv' % (mnase_dir, nuc, strand_name))

    joined_12 = positions[1].join(positions[2], lsuffix='_+1', rsuffix='_+2')
    joined_23 = positions[2].join(positions[3], lsuffix='_+2', rsuffix='_+3')

    orfs_ok_12 = set(validate_pair(joined_12, '+1', '+2'))
    orfs_ok_23 = set(validate_pair(joined_23, '+2', '+3'))
    valid_orfs = list(orfs_ok_12.intersection(orfs_ok_23))

    # each shift is computed from its own fresh selection, as before
    shifts = [difference(positions[nuc].loc[valid_orfs])
              for nuc in nuc_numbers]
    selected = [positions[nuc].loc[valid_orfs] for nuc in nuc_numbers]

    return (selected[0], selected[1], selected[2],
            shifts[0], shifts[1], shifts[2])
def plot_antisense(self, antisense=None):
    """Plot antisense TPM quartile bars per cluster and time point.

    For each cluster and each time column a vertical bar is drawn between
    the 0.25 and 0.75 quantiles, with a diamond marker at the median.

    Args:
        antisense: optional table that is row-selected with the index of
            ``self.hc.clustered_data``; when ``None`` it is read from
            ``<rna_dir>/antisense_TPM.csv``.
    """
    apply_global_settings(titlepad=45)
    clustered = self.hc.clustered_data

    from src.datasets import read_orfs_data
    if antisense is None:
        antisense = read_orfs_data('%s/antisense_TPM.csv' % rna_dir)

    tpm = antisense.loc[clustered.index].join(clustered[['cluster']])

    fig, ax = plt.subplots(figsize=(7, 4))
    fig.tight_layout(rect=[0.05, 0.1, 0.95, 0.8])

    times = [0.0, 7.5, 15, 30, 60, 120]
    spacing = 0.13
    num_clusters = len(tpm.cluster.unique())

    for cluster in range(1, num_clusters + 1):
        cluster_tpm = tpm[tpm.cluster == cluster][times]

        for slot, time in enumerate(times):
            values = cluster_tpm[time].values
            q75 = np.quantile(values, 0.75)
            q25 = np.quantile(values, 0.25)

            # spread the time points horizontally around the cluster tick
            x_pos = cluster + spacing * slot - spacing * 2.5

            ax.plot([x_pos, x_pos], [q75, q25], linewidth=3.,
                    color='#FF5C5C', alpha=1, solid_capstyle='butt')
            ax.scatter(x_pos, np.median(values), s=6, marker='D',
                       color='black', zorder=10)

    ax.set_xticks(np.arange(num_clusters + 1))
    ax.set_xlim(0.5, num_clusters + 0.5)
    # ax.set_yticks(np.arange(0, 40, 10))

    ax.tick_params(axis='x', length=0, pad=10, labelsize=16)
    ax.tick_params(axis='y', labelsize=16)
    ax.set_ylabel('Transcripts per million', fontsize=18)
    ax.set_xlabel('Cluster', fontsize=18)
    ax.set_title('Antisense transcripts per cluster', fontsize=23)

    # separators between neighbouring clusters
    for boundary in np.arange(1, num_clusters):
        ax.axvline(boundary + 0.5, color='#d0d0d0', linewidth=1)
def plot_xrate_vs_TPM(datastore, half_lifes=None):
    """Scatter the 120' transcription-rate log fold-change against TPM.

    The x values are the ``120`` column of
    ``datastore.transcript_rate_logfold`` and the y values the ``120``
    column of ``datastore.sense_TPM_logfold``. Points are coloured by the
    ``half_life`` column of ``data/half_life.csv`` and the Pearson
    correlation is reported in the figure title.

    Args:
        datastore: object exposing ``transcript_rate_logfold`` and
            ``sense_TPM_logfold`` tables.
        half_lifes: optional pair ``(low, high)``; when given, only rows
            with ``low < half_life < high`` are kept and a dashed white
            line is drawn on the colour bar for each value.
    """
    half_lives = read_orfs_data('data/half_life.csv')

    plot_data = datastore.transcript_rate_logfold[[120]]
    plot_data = plot_data.join(half_lives[['half_life']])
    plot_data = plot_data.sort_values('half_life')
    plot_data.loc[plot_data.index, 'TPM'] = \
        datastore.sense_TPM_logfold.loc[plot_data.index][120]

    if half_lifes is not None:
        plot_data = plot_data[(plot_data.half_life > half_lifes[0]) &
                              (plot_data.half_life < half_lifes[1])]

    cor, pval = pearsonr(plot_data[120.0], plot_data.TPM)
    pval = convert_to_latex_sci_not(pval)

    title = 'Log$_2$ fold-change in transcription rate vs\n' \
            'log$_2$ fold-change in transcript level, 0-120\''
    title = ("%s\nPearson's r=%.2f, p=%s" % (title, cor, pval))

    fig, ax = plt.subplots(1, 1, figsize=(7, 6))
    fig.tight_layout(rect=[0.1, 0.1, 0.9, 0.85])

    # Hollow grey outline behind every point. The original passed c='',
    # which is not a valid colour specification; facecolors='none' is the
    # documented way to draw unfilled markers.
    ax.scatter(plot_data[120.0], plot_data.TPM,
               facecolors='none', edgecolor='#c0c0c0', s=5, rasterized=True)

    # Filled points coloured by half life on a fixed 0-100 scale.
    scatter = ax.scatter(plot_data[120.0], plot_data.TPM,
                         c=plot_data['half_life'], s=3, cmap='magma_r',
                         vmin=0, vmax=100, rasterized=True)

    ax.set_xlabel('Log$_2$ fold-change transcription rate')
    ax.set_ylabel('Log$_2$ fold-change transcript level, TPM')
    plt.suptitle(title, fontsize=16)

    cbar = plt.colorbar(scatter)
    cbar.ax.set_title("Half life, min")

    if half_lifes is not None:
        for hl in half_lifes:
            # position is the half life as a fraction of vmax (100)
            cbar.ax.plot([0, 1.], [hl / 100., hl / 100.], color='white',
                         linestyle='dashed', zorder=100)
def plot_coverage(misc_figures_dir):
    """Save a histogram of MNase-seq coverage values above 0.8.

    Coverage is read from ``<mnase_dir>/coverage_2000.csv`` and the figure
    is written to ``<misc_figures_dir>/coverage.pdf``.

    Args:
        misc_figures_dir: directory that receives ``coverage.pdf``.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    fig.tight_layout(rect=[0.1, 0.15, 0.95, 0.8])

    coverage_table = read_orfs_data('%s/coverage_2000.csv' % mnase_dir)
    well_covered = coverage_table[coverage_table.coverage > 0.8]

    ax.hist(well_covered['coverage'], edgecolor='white', bins=25)
    ax.set_title('MNase-seq coverage\n[-1000, 1000] around TSS',
                 fontsize=18)

    # red marker line at coverage 0.85
    ax.axvline(x=0.85, color='red')
    ax.set_xlim(0.8, 1.0)
    ax.set_xlabel("Coverage", fontsize=15)
    ax.set_ylabel("# of genes", fontsize=15)

    out_path = "%s/coverage.pdf" % misc_figures_dir
    plt.savefig(out_path, transparent=True)
def plot_half_lifes(self):
    """Plot half-life quartile bars for clusters 1 through 7.

    For each cluster a vertical bar is drawn between the 0.25 and 0.75
    quantiles of ``half_life`` (read from ``data/half_life.csv``) with a
    diamond marker at the median.
    """
    apply_global_settings(titlepad=20)
    clustered = self.hc.clustered_data

    from src.datasets import read_orfs_data
    half_life_table = read_orfs_data('data/half_life.csv')[['half_life']]
    per_gene = half_life_table.loc[clustered.index]
    per_gene = per_gene.join(clustered[['cluster']])

    fig, ax = plt.subplots(figsize=(7, 5))
    fig.tight_layout(rect=[0.05, 0.1, 0.95, 0.8])

    spacing = 0.13
    for cluster in range(1, 8):
        values = per_gene[per_gene.cluster == cluster].half_life
        q75 = np.quantile(values, 0.75)
        q25 = np.quantile(values, 0.25)

        # fixed horizontal offset from the cluster tick
        x_pos = cluster + spacing * 3 - spacing * 2.5

        ax.plot([x_pos, x_pos], [q75, q25], linewidth=6.,
                color='#abd1fc', alpha=1, solid_capstyle='butt')
        ax.scatter(x_pos, np.median(values), s=16, marker='D',
                   color='black', zorder=10)

    ax.set_xticks(np.arange(8))
    ax.set_xlim(0.5, 7.5)
    # ax.set_yticks(np.arange(0, 200, 50))
    ax.set_ylim(0, 50)

    ax.tick_params(axis='x', length=0, pad=10, labelsize=16)
    ax.tick_params(axis='y', labelsize=16)
    ax.set_ylabel('Half life, min', fontsize=18)
    ax.set_xlabel('Cluster', fontsize=18)
    ax.set_title('Half lifes per cluster', fontsize=30)

    # separators between neighbouring clusters
    for boundary in np.arange(1, 8):
        ax.axvline(boundary + 0.5, color='#d0d0d0', linewidth=1)
def plot_antisense_lengths():
    """Plot a histogram of antisense transcript lengths.

    Lengths are ``stop - start`` from
    ``<rna_dir>/antisense_boundaries_computed.csv`` with missing values
    dropped; the number of remaining entries is shown in the title.
    """
    boundaries = read_orfs_data(
        '%s/antisense_boundaries_computed.csv' % rna_dir)

    from src.plot_utils import apply_global_settings
    apply_global_settings()

    fig, ax = plt.subplots(figsize=(4.5, 3))
    fig.tight_layout(rect=[0.05, 0.05, 0.95, 0.9])

    raw_lengths = boundaries.stop - boundaries.start
    lengths = raw_lengths.dropna()

    ax.hist(lengths, bins=25, linewidth=1, edgecolor='white')

    title = "Antisense transcript lengths, N=%d" % len(lengths)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel("Length (bp)")
    ax.set_ylabel("# of genes")
def load_results(self, path):
    """Load saved predictions and compute an r2 score per time point.

    For every entry of ``self.times`` the columns ``'<t>_true'`` and
    ``'<t>_predicted'`` (``t`` formatted with ``%.1f``) are read from the
    table at ``path``.

    Args:
        path: location passed to ``read_orfs_data``.

    Sets:
        self.Y: true values, one column per time.
        self.Y_predict: predicted values, one column per time.
        self.r2: frame indexed by time with a single ``r2`` column.
    """
    results = read_orfs_data(path).copy()

    # empty frames that share the row index of the results table
    observed = results[[]].copy()
    fitted = results[[]].copy()

    scores = pd.DataFrame(index=self.times)
    scores['r2'] = 0.0

    for t in self.times:
        y_hat = results['%.1f_predicted' % (t)]
        y_true = results['%.1f_true' % (t)]

        fitted.loc[:, t] = y_hat
        observed.loc[:, t] = y_true
        scores.loc[t, 'r2'] = r2_score(y_true, y_hat)

    self.Y = observed
    self.Y_predict = fitted
    self.r2 = scores
def plot_antisense_heatmap(self, orf_names=None):
    """Draw a heatmap of log2(antisense TPM + 1), rows ordered by cluster.

    Args:
        orf_names: optional row labels; when given, the heatmap is
            restricted to these rows (in the given order).
    """
    by_cluster = self.hc.clustered_data.sort_values('cluster')

    tpm = read_orfs_data('%s/antisense_TPM.csv' % rna_dir)
    tpm = tpm.loc[by_cluster.index]
    if orf_names is not None:
        tpm = tpm.loc[orf_names]

    plt.figure(figsize=(4, 10))

    log_tpm = np.log2(tpm + 1)
    # aspect is scaled by the number of rows in the heatmap
    row_aspect = 40. / len(log_tpm)
    plt.imshow(log_tpm, aspect=row_aspect, vmin=0, vmax=12, cmap='viridis')

    plt.yticks([])
    plt.xticks([])
def create_suppl_data():
    """Create chromatin data files for supplemental"""
    from src.chromatin_metrics_data import ChromatinDataStore

    # read relevant data
    store = ChromatinDataStore()
    sm_occupancy = store.promoter_sm_occupancy_raw
    disorganization = store.gene_body_organization_raw
    expression = read_orfs_data('%s/sense_TPM.csv' % rna_dir)

    # fix header columns
    for table in (sm_occupancy, disorganization, expression):
        table.columns = times_str

    # NOTE(review): this first file is written without float_format, unlike
    # the two below -- confirm whether that is intended.
    sm_occupancy.to_csv(
        '%s/small_fragment_promoter_occupancy_all_times.csv' % mnase_dir)

    # save to disk with fixed number of decimals
    disorganization_path = (
        '%s/gene_body_nucleosome_disorganization_entropy_all_times.csv'
        % mnase_dir)
    disorganization.to_csv(disorganization_path, float_format='%.4f')

    # save to disk with fixed number of decimals
    expression_path = '%s/gene_expression_TPM_all_times.csv' % mnase_dir
    expression.to_csv(expression_path, float_format='%.4f')
def __init__(self, is_antisense=False, output_dir=None):
    """Load expression and chromatin tables and derive per-ORF metrics.

    Args:
        is_antisense: when true, the 'antisense' occupancy/entropy files
            are used and the ORF set is the rows of
            ``antisense_boundaries_computed.csv`` without missing values;
            otherwise the 'sense' files and the ORFs of
            ``orfs_cd_paper_dataset.csv`` are used.
        output_dir: base directory holding ``rna_seq`` and ``mnase_seq``;
            defaults to ``OUTPUT_DIR`` when ``None``.

    Note:
        ``self.antisense_TPM_logfold`` is only set when ``is_antisense`` is
        true and ``self.sense_TPM_logfold`` only when it is false.
    """
    self.is_antisense = is_antisense
    if is_antisense:
        strand_name = 'antisense'
    else:
        strand_name = 'sense'

    if output_dir is None:
        out_dir = OUTPUT_DIR
    else:
        out_dir = output_dir

    rna_dir = '%s/rna_seq' % out_dir
    mnase_dir = '%s/mnase_seq' % out_dir

    orfs = read_orfs_data('%s/orfs_cd_paper_dataset.csv' % out_dir)
    orfs_idx = orfs.index.values

    antisense_path = '%s/antisense_boundaries_computed.csv' % rna_dir
    antisense_TSS = read_orfs_data(antisense_path)

    if is_antisense:
        # restrict to ORFs whose antisense boundary row has no missing value
        orfs_idx = antisense_TSS.dropna().index.values
        antisense_TPM_logfold = read_orfs_data(
            '%s/antisense_TPM_log2fold.csv' % rna_dir)
        self.antisense_TPM_logfold = antisense_TPM_logfold.loc[orfs_idx]
    else:
        sense_TPM_logfold = read_orfs_data('%s/sense_TPM_log2fold.csv' % rna_dir)
        self.sense_TPM_logfold = sense_TPM_logfold.loc[orfs_idx]

    xrate = read_orfs_data('%s/orf_xrates.csv' % rna_dir)
    xrate_logfold = read_orfs_data('%s/orf_xrates_log2fold.csv' % rna_dir)

    # occupancy table is indexed by (orf_name, time)
    path = '%s/occupancies_%s.csv' % (mnase_dir, strand_name)
    occupancy = pd.read_csv(path)\
        .set_index(['orf_name', 'time'])
    self.occupancy = occupancy

    from src.entropy import load_orf_entropies
    from src.nucleosome_calling import load_p123

    # NOTE(review): load_p123 receives only the strand name, not the local
    # mnase_dir derived from output_dir -- confirm it reads from the
    # intended directory when output_dir is overridden.
    (self.p1, self.p2, self.p3,
     self.p1_shift, self.p2_shift, self.p3_shift) = load_p123(strand_name)

    # sense TPM is loaded for every ORF in the file (not subset to orfs_idx)
    TPM = read_orfs_data('%s/sense_TPM.csv' % rna_dir)
    self.sense_TPM = TPM
    self.sense_log2_TPM = np.log2(TPM + 1)

    self.N = len(orfs_idx)

    # promoter occupancy (scale by length of 'promoter')
    self.promoter_sm_occupancy_raw = pivot_metric(occupancy.loc[orfs_idx],
        '-200_0_len_0_100')
    self.promoter_sm_occupancy = pivot_metric(occupancy.loc[orfs_idx],
        '-200_0_len_0_100') / 200.
    self.promoter_sm_occupancy = normalize_by_time(
        self.promoter_sm_occupancy)

    # promoter nucleosome occupancy (scale by length of 'promoter')
    self.promoter_nuc_occupancy_raw = pivot_metric(occupancy.loc[orfs_idx],
        '-200_0_len_144_174')
    self.promoter_nuc_occupancy = self.promoter_nuc_occupancy_raw / 200.
    self.promoter_nuc_occupancy = normalize_by_time(
        self.promoter_nuc_occupancy)

    # gene body nucleosome occupancy (scale by length of 'gene body')
    # NOTE(review): the metric key is '0_500...' but the divisor is 200,
    # the same as for the promoter metrics -- confirm this is intended.
    self.gene_body_nuc_occupancy_raw = pivot_metric(
        occupancy.loc[orfs_idx], '0_500_len_144_174')
    self.gene_body_nuc_occupancy = self.gene_body_nuc_occupancy_raw / 200.
    self.gene_body_nuc_occupancy = normalize_by_time(
        self.gene_body_nuc_occupancy)

    # gene body organization
    gene_body_organization = load_orf_entropies('0_150', 'triple',
        strand_name, mnase_seq_dir=mnase_dir)
    self.gene_body_organization = gene_body_organization.copy(
        ).loc[orfs_idx]
    # keep the un-normalized table before it is replaced below
    self.gene_body_organization_raw = self.gene_body_organization
    self.gene_body_organization = normalize_by_time(
        self.gene_body_organization)

    # promoter organization (entropies for the '-200_0' window)
    promoter_organization = load_orf_entropies('-200_0', 'triple',
        strand_name, mnase_seq_dir=mnase_dir)
    self.promoter_organization = promoter_organization.loc[orfs_idx]
    self.promoter_organization = normalize_by_time(
        self.promoter_organization)

    self.transcript_rate = xrate.copy().loc[orfs_idx]
    self.transcript_rate_logfold = xrate_logfold.loc[orfs_idx]

    # deltas: `difference` applied to each normalized metric
    self.promoter_sm_occupancy_delta = \
        difference(self.promoter_sm_occupancy)
    self.gene_body_disorganization_delta = \
        difference(self.gene_body_organization)
    self.promoter_disorganization_delta = \
        difference(self.promoter_organization)

    # other deltas
    self.promoter_nuc_occ_delta = \
        difference(self.promoter_nuc_occupancy)
    self.gene_body_nuc_occ_delta = \
        difference(self.gene_body_nuc_occupancy)

    self.orfs = orfs

    # promoter and gene-body deltas side by side; the suffixes separate
    # column names that appear in both tables
    self.chromatin_data = self.promoter_sm_occupancy_delta.join(
        self.gene_body_disorganization_delta,
        lsuffix='_promoter', rsuffix='_gene')

    # inner join keeps only ORFs that also have a transcription-rate row
    self.data = self.chromatin_data.join(self.transcript_rate_logfold,
        how='inner')

    self.xlabels = [
        'Small fragment\noccupancy',
        'Nucleosome\ndisorganization',
        'Transcription\nrate'
    ]

    self.sort_data()
# ---------- Analysis ---------------

# Gaussian-process output directory (note the trailing slash)
gp_dir = '%s/gp/' % OUTPUT_DIR

# +1/+2/+3 position tables, sense strand
p1_sense_path = '%s/p1_sense.csv' % mnase_dir
p2_sense_path = '%s/p2_sense.csv' % mnase_dir
p3_sense_path = '%s/p3_sense.csv' % mnase_dir

# +1/+2/+3 position tables, antisense strand
p1_antisense_path = '%s/p1_antisense.csv' % mnase_dir
p2_antisense_path = '%s/p2_antisense.csv' % mnase_dir
p3_antisense_path = '%s/p3_antisense.csv' % mnase_dir

# --------- Global data -----------------

from src.datasets import read_orfs_data
import os

# load paper orfs if possible; stays None when the file does not exist
paper_orfs_path = "%s/orfs_cd_paper_dataset.csv" % OUTPUT_DIR
paper_orfs = (read_orfs_data(paper_orfs_path)
              if os.path.exists(paper_orfs_path) else None)

# load antisense orfs if possible; stays None when the file does not exist
antisense_orfs_path = '%s/antisense_boundaries_computed.csv' % rna_dir
antisense_orfs = (read_orfs_data(antisense_orfs_path)
                  if os.path.exists(antisense_orfs_path) else None)
def antisense_plots():
    """Generate the antisense figures and save them as PDFs.

    All files are written to ``<OUTPUT_DIR>/antisense``. The function uses
    the module-level ``datastore`` for sense transcription rates and builds
    its own ``ChromatinDataStore(is_antisense=True)`` for the antisense
    chromatin metrics.

    Compared with the previous version, two CSV loads whose results were
    never used (``antisense_TPM.csv`` and
    ``antisense_boundaries_computed.csv``) and the unused ``model``
    bindings were removed; the saved figures are unaffected.
    """
    from src.antisense_analysis import plot_antisense_vs_sense
    from src.antisense_analysis import plot_bar_counts, plot_antisense_dist
    from src.antisense_analysis import plot_antisense_lengths, plot_antisense_calling
    from src.chromatin_summary_plots import plot_distribution

    save_dir = '%s/antisense' % OUTPUT_DIR
    mkdirs_safe([save_dir])

    antisense_TPM_logfold = read_orfs_data('%s/antisense_TPM_log2fold.csv'
                                           % rna_dir)

    # sense vs antisense log fold-change at the 120' time point
    plot_antisense_vs_sense(
        antisense_TPM_logfold, datastore.transcript_rate_logfold, 120.0,
        highlight=['MET31', 'CKB1', 'RPS7A', 'YBR241C', 'UTR2'])
    plt.savefig('%s/sense_antisense_distr.pdf' % save_dir, transparent=True,
                dpi=100)

    plot_bar_counts(antisense_TPM_logfold, datastore.transcript_rate_logfold)
    plt.savefig('%s/sense_antisense_counts.pdf' % save_dir)

    plot_antisense_dist(antisense_TPM_logfold)
    plt.savefig('%s/antisense_logfc_dist.pdf' % save_dir)

    rna_seq_pileup = pd.read_hdf('%s/rna_seq_pileup.h5.z' % rna_dir,
                                 'pileup')

    plot_antisense_lengths()
    plt.savefig('%s/antisense_lengths_dist.pdf' % save_dir)

    plot_antisense_calling('MET31', rna_seq_pileup)
    plt.savefig('%s/antisense_met31_calling.pdf' % save_dir)

    anti_datastore = ChromatinDataStore(is_antisense=True)

    # mean promoter occupancy change vs mean antisense log fold-change
    x = anti_datastore.promoter_sm_occupancy_delta.mean(axis=1)
    y = anti_datastore.antisense_TPM_logfold.mean(axis=1).loc[x.index]
    plot_distribution(
        x, y,
        '$\\Delta$ Antisense promoter occupancy',
        'Log$_2$ fold-change antisense transcript',
        highlight=[],
        title='Promoter occupancy vs transcription (Antisense)',
        xlim=(-1.5, 1.5), ylim=(-4, 4), xstep=0.5, ystep=1)
    plt.savefig('%s/antisense_chrom_dist_prom_vs_xrate.pdf' % save_dir)

    # mean disorganization change (rows with a missing mean dropped) vs
    # mean antisense log fold-change
    x = anti_datastore.gene_body_disorganization_delta.mean(axis=1).dropna()
    y = anti_datastore.antisense_TPM_logfold.loc[x.index].mean(
        axis=1).loc[x.index]
    plot_distribution(
        x, y,
        '$\\Delta$ antisense nucleosome disorganization',
        'Log$_2$ fold-change antisense transcripts',
        highlight=[],
        title='Nuc. disorganization vs transcription (Antisense)',
        xlim=(-1.5, 1.5), ylim=(-4, 4), xstep=0.5, ystep=1)
    plt.savefig('%s/antisense_chrom_dist_disorg_vs_xrate.pdf' % save_dir)
from src.utils import print_fl
from src.datasets import read_orfs_data
from src.timer import Timer
from src.reference_data import (read_park_TSS_PAS, read_brogaard_nucleosomes,
                                read_macisaac_abf1_sites, read_sgd_orfs)

# global timer
timer = Timer()

# global inputs; the None placeholders are filled in later
# (rna_seq is assigned by read_input_data below)
rna_seq = None
all_mnase_data = None
mnase_coverage = None

# reference tables are loaded at import time
all_orfs = read_sgd_orfs()
half_lives = read_orfs_data('data/half_life.csv')
TSSs = read_park_TSS_PAS()

# ORF table extended with the TSS and PAS columns
orfs = all_orfs.join(TSSs[['TSS', 'PAS']])

# in DEBUG mode only ORFs on the DEBUG_CHROMS chromosomes are kept
if DEBUG:
    orfs = orfs[orfs.chr.isin(DEBUG_CHROMS)]


def read_input_data():
    """Read the RNA-seq table into the module-level ``rna_seq`` global.

    The table is read with ``pd.read_hdf`` from ``rna_seq_path`` under the
    key ``'rna_seq_data'``; progress messages go through ``print_fl``.
    """
    global rna_seq
    # NOTE(review): all_mnase_data is declared global here but is not
    # assigned in the visible part of this function.
    global all_mnase_data

    print_fl("Reading RNA-seq...", end='')
    rna_seq = pd.read_hdf(rna_seq_path, 'rna_seq_data')
    print_fl("Done.")
def plot_antisense_calling(gene_name, rna_seq_pileup):
    """Plot strand-specific RNA-seq pileup around a gene with its called
    antisense boundaries.

    The top axis shows ORF annotations. The bottom axis shows the smoothed
    log2 pileup of the '+' strand above zero (blue) and of the '-' strand
    below zero (red), within 1000 bp of the gene's transcript boundaries.
    If the gene has an entry in ``antisense_boundaries_computed.csv``, its
    ``start`` and ``stop`` are marked with red vertical segments.

    Args:
        gene_name: gene identifier passed to ``get_orf``.
        rna_seq_pileup: pileup table passed to ``filter_rna_seq_pileup``;
            the filtered result must provide ``strand``, ``position`` and
            ``pileup`` columns.
    """
    from src.rna_seq_plotter import get_smoothing_kernel
    from src.plot_utils import apply_global_settings
    from src.utils import get_orf
    from src.transcription import filter_rna_seq_pileup
    from src.transcript_boundaries import load_park_boundaries
    from src.plot_orf_annotations import ORFAnnotationPlotter
    from config import paper_orfs
    from src.reference_data import read_sgd_orfs, read_park_TSS_PAS
    from src.datasets import read_orfs_data

    all_orfs = read_sgd_orfs()
    all_orfs = all_orfs.join(read_park_TSS_PAS()[['TSS', 'PAS']])
    orfs_plotter = ORFAnnotationPlotter(orfs=all_orfs)

    antisense_boundaries = read_orfs_data(
        '%s/antisense_boundaries_computed.csv' % rna_dir)

    park_boundaries = load_park_boundaries()
    park_boundaries = park_boundaries.join(paper_orfs[['name']])
    gene = get_orf(gene_name, park_boundaries)

    # window: transcript boundaries padded by 1000 bp on each side
    padding = 1000
    plot_span = (gene.transcript_start - padding,
                 gene.transcript_stop + padding)
    gene_pileup = filter_rna_seq_pileup(rna_seq_pileup,
                                        plot_span[0], plot_span[1], gene.chr)

    apply_global_settings(30)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 5.))
    fig.tight_layout(rect=[0.1, 0, 1, 0.85])

    orfs_plotter.set_span_chrom(plot_span, gene.chr)
    orfs_plotter.plot_orf_annotations(ax1)

    # The two tracks are split by genomic strand, not by the gene's own
    # orientation: '+' is drawn above the axis and '-' below it.
    plus_reads = gene_pileup[gene_pileup.strand == '+']
    minus_reads = gene_pileup[gene_pileup.strand == '-']

    plus_signal = np.log2(plus_reads.groupby('position').sum()+1).pileup
    minus_signal = np.log2(minus_reads.groupby('position').sum()+1).pileup

    smooth_kernel = get_smoothing_kernel(100, 20)

    # NOTE(review): the '+' strand positions are used as x for both tracks;
    # this assumes both strands cover the same positions -- confirm.
    x = plus_signal.index
    plus_signal = np.convolve(plus_signal, smooth_kernel, mode='same')
    minus_signal = np.convolve(minus_signal, smooth_kernel, mode='same')

    ax2.plot(x, plus_signal, color=plt.get_cmap('Blues')(0.5))
    ax2.plot(x, -minus_signal, color=plt.get_cmap('Reds')(0.5))
    ax2.set_xlim(*plot_span)
    ax2.set_ylim(-15, 15)
    ax2.axhline(0, color='black')

    if gene.name in antisense_boundaries.index:
        anti_gene = antisense_boundaries.loc[gene.name]

        # Antisense markers go on the side opposite the gene's strand.
        # The original `0, 20 if ... else -20, 0` built a 3-tuple because
        # the conditional binds tighter than the commas; explicit pairs
        # state the intent and draw the same segment.
        y_plot = (0, 20) if gene.strand == '-' else (-20, 0)

        ax2.plot([anti_gene.start, anti_gene.start],
                 [y_plot[0], y_plot[1]],
                 color='red', linewidth=2.5, solid_capstyle='butt')
        ax2.plot([anti_gene.stop, anti_gene.stop],
                 [y_plot[0], y_plot[1]],
                 color='red', linewidth=2.5, solid_capstyle='butt')

    ax2.set_xticks(np.arange(plot_span[0], plot_span[1], 500))
    ax2.set_xticklabels([])
    _ = ax2.set_xticks(np.arange(plot_span[0], plot_span[1], 100),
                       minor=True)

    ax2.tick_params(labelsize=14)
    ax2.set_ylabel("Sum log$_2$ (pileup+1)", fontsize=15)
    ax2.set_xlabel("Position (bp)", fontsize=15)
    ax1.set_title("Calling antisense transcripts", fontsize=26)

    # NOTE(review): fixed genomic coordinates drawn for every gene --
    # presumably specific to one locus; confirm they should stay.
    ax2.axvline(383344)
    ax2.axvline(384114)
def design_matrix(self, incl_times=None,
                  incl_prom=True, incl_gene=True,
                  incl_occ=True, incl_cc=True,
                  incl_small=True, incl_nuc=True,
                  incl_sense=True, incl_antisense=True,
                  incl_shift=False,
                  predict_TPM=True, include_TPM_0=True,
                  scale=True, logfold=True):
    """Build the covariate matrix X and outcome Y and store them on self.

    Args:
        incl_times: time points forwarded to ``load_design_matrix``;
            ``None`` (the default) means ``[0, 7.5, 15, 30, 60, 120]``.
        incl_prom, incl_gene, incl_occ, incl_cc, incl_small, incl_nuc:
            feature-group switches forwarded to ``load_design_matrix``.
        incl_sense: include the sense-strand covariates.
        incl_antisense: include the antisense-strand covariates (always
            loaded with ``incl_shift=False``).
        incl_shift: forwarded to ``load_design_matrix`` for the sense
            covariates only.
        predict_TPM: when true, Y is ``log2(sense TPM + 0.1)``; otherwise Y
            is read from ``orf_xrates_log2fold.csv``.
        include_TPM_0: add ``log2`` of the time-0 TPM as the ``'0.0_TPM'``
            covariate; forced off when ``self.name == 'Intercept'``.
        scale: standardize the covariates with ``preprocessing.scale``.
        logfold: log2-transform (after adding 0.1) the columns whose names
            contain ``'_cc_'`` or ``'_occ_'``.

    Returns:
        Tuple ``(X, Y)``; the same objects are stored as ``self.X`` and
        ``self.Y``. X always ends with a constant ``'intercept'`` column.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between calls and with load_design_matrix.
    if incl_times is None:
        incl_times = [0, 7.5, 15, 30, 60, 120]

    if self.name == 'Intercept':
        include_TPM_0 = False

    orfs = paper_orfs

    # TODO: Testing to see how GPR performs on good set of genes @ 120' only
    # more complex subsetting if we want to subset different genes per
    # each time point
    if SUBSET_GPR_GENES:
        path = '%s/good_p1_nucs_gene_set_120.csv' % mnase_dir
        subset_idx = read_orfs_data(path).index.values
        orfs = orfs.loc[subset_idx]
        print_fl("Subsetting to well-positioned +1 nucleosomes, N=%d" %
                 len(orfs))

    orfs_idx = orfs.index.values

    # start from an empty frame that only carries the ORF index
    X = orfs[[]].copy()

    if incl_sense:
        sense_X = self.load_design_matrix(incl_times=incl_times,
                                          incl_prom=incl_prom,
                                          incl_gene=incl_gene,
                                          incl_occ=incl_occ,
                                          incl_cc=incl_cc,
                                          incl_small=incl_small,
                                          incl_nuc=incl_nuc,
                                          incl_shift=incl_shift)
        X = X.join(sense_X)

    if incl_antisense:
        antisense_X = self.load_design_matrix(
            incl_times=incl_times,
            incl_prom=incl_prom, incl_gene=incl_gene,
            incl_occ=incl_occ, incl_cc=incl_cc,
            incl_small=incl_small, incl_nuc=incl_nuc,
            incl_shift=False,  # no antisense shift data
            antisense=True)
        X = X.join(antisense_X, lsuffix='_sense', rsuffix='_antisense')

    # TPM is needed for the outcome (predict_TPM) and/or for the time-0
    # covariate (include_TPM_0). It used to be loaded only in the
    # predict_TPM branch, so predict_TPM=False with include_TPM_0=True
    # raised a NameError.
    TPM = None
    if predict_TPM or include_TPM_0:
        TPM = read_orfs_data('%s/sense_TPM.csv' % rna_dir).loc[orfs_idx]

    # load outcome
    if predict_TPM:
        # predict absolute TPM level (log2)
        Y = np.log2(TPM + 0.1)
    else:
        # predict log2 fold change
        xrate_logfold = read_orfs_data('%s/orf_xrates_log2fold.csv' %
                                       rna_dir)
        Y = xrate_logfold.loc[orfs_idx]

    # add TPM at time 0
    if include_TPM_0:
        X['0.0_TPM'] = np.log2(TPM[0].copy())

    if self.sample_N is not None:
        # fixed seed so the same subsample is drawn on every call
        np.random.seed(123)
        orfs_idx = X.index
        orfs_idx = np.random.choice(orfs_idx, self.sample_N, replace=False)
        X = X.loc[orfs_idx]
        Y = Y.loc[orfs_idx]

    # TODO: replace infinite values with 0
    X = X.replace([np.inf, -np.inf], 0.0)

    if len(X.columns) > 0:

        # logfold covariates
        if logfold:
            columns = X.columns
            # log transform cross correlation and occupancy columns
            logfold_cols = columns[columns.str.contains('_cc_') |
                                   columns.str.contains('_occ_')]
            X.loc[:, logfold_cols] = np.log2(X[logfold_cols] + 0.1)

        # scale covariates
        if scale:
            X.loc[:] = preprocessing.scale(X)

    # constant column, added after scaling
    X['intercept'] = 1

    self.X, self.Y = X, Y

    return X, Y