def plot_long_RNA_peak(peaks, ax, ce, top_n=10, y_val='log10p'):
    """Draw a bar plot of the strongest long-RNA peaks on ``ax``.

    Rows of ``peaks`` whose ``sense_gtype`` contains ``'Long RNA'`` and whose
    ``sample_count`` is at least the module-level ``sample_cutoff`` are
    reduced per ``sense_gname`` with ``pick_lp``.  Gene names are then mapped
    through ``NameConversion().convert`` (with ``'-NPIPA8'`` stripped), the
    best row per converted name is kept, and the ``top_n`` rows with the
    largest ``y_val`` are plotted in descending order.

    Parameters
    ----------
    peaks : pandas.DataFrame
        Peak table; must provide the ``sense_gtype``, ``sample_count``,
        ``sense_gname`` and ``y_val`` columns used below.
    ax : matplotlib Axes
        Axes to draw on.
    ce : object
        Accepted for interface compatibility but not used; bar colours come
        from the module-level ``peak_type_ce`` encoder.
    top_n : int
        Number of bars to show.
    y_val : str
        Column used for bar height and ranking: ``'log10p'`` or ``'pileup'``.

    Raises
    ------
    AssertionError
        If ``y_val`` is not one of the two supported columns.
    """
    # Fail fast: validate the requested column before any dataframe work.
    assert y_val in ['log10p', 'pileup'], \
        "y_val must be 'log10p' or 'pileup', got %r" % (y_val,)

    long_rna_peaks = peaks[peaks.sense_gtype.str.contains('Long RNA')] \
        .query('sample_count >= %i' % sample_cutoff) \
        .groupby('sense_gname', as_index=False) \
        .apply(pick_lp)

    name_conversion = NameConversion()
    # cat_long_rna_type is defined elsewhere; presumably it adds the `rt`
    # (RNA type) column that is read below -- TODO confirm.
    lp = long_rna_peaks \
        .assign(picked_RNA_sense=lambda d: d.sense_gname
                .map(name_conversion.convert)
                .str.replace('-NPIPA8', '')) \
        .groupby('picked_RNA_sense') \
        .apply(lambda d: d.nlargest(1, y_val)) \
        .nlargest(top_n, y_val) \
        .pipe(cat_long_rna_type) \
        .sort_values(y_val, ascending=False)

    # One colour per bar, in plotting order.
    colors = lp.rt.map(peak_type_ce.encoder).values
    sns.barplot(data=lp,
                x='picked_RNA_sense',
                y=y_val,
                palette=colors,
                ax=ax)
    ax.legend().set_visible(False)
    ax.set_xlabel('')
    if y_val == 'log10p':
        ax.set_ylabel('-$log_{10}$ p-value', fontsize=20)
    else:
        ax.set_ylabel('Coverage', fontsize=20)
    ax.set_xticklabels(ax.get_xticklabels(),
                       rotation=70,
                       rotation_mode='anchor',
                       ha='right')

    # Legend restricted to the RNA types that actually appear in the plot.
    used = lp.rt.unique()
    cc_ce = color_encoder()
    cc_ce.encoder = {
        k: v
        for k, v in peak_type_ce.encoder.items()
        if k in used
    }
    cc_ce.show_legend(ax=ax, frameon=False, fontsize=20)

    # Colour each tick label like its bar.
    for col, xt in zip(colors, ax.get_xticklabels()):
        xt.set_color(col)
def plot_pymc_bar(ax):
    """Plot the 15 largest ``delta`` values from ``get_pymc_df()`` as bars.

    Rows with ``bayes_factor > 3`` are kept, the 15 with the largest
    ``delta`` are selected, ``delta`` is multiplied by 100 and each bar is
    coloured by a fresh ``color_encoder`` fitted on the ``is_telo`` column.
    Tick labels are shortened to the text after the first ``':'``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    """
    ce = color_encoder()
    bar_df = get_pymc_df() \
        .query('bayes_factor > 3') \
        .nlargest(15, 'delta') \
        .assign(color=lambda d: ce.fit_transform(d['is_telo'])) \
        .assign(delta=lambda d: d.delta * 100)
    bar_df.plot.bar('gene_name', 'delta',
                    color=bar_df.color.tolist(),
                    ax=ax)
    ce.show_legend(ax, frameon=False, fontsize=20,
                   bbox_to_anchor=(0.5, 0.7))

    short_labels = []
    for tick in ax.get_xticklabels():
        text = tick.get_text()
        fields = text.split(':')
        # Keep the label as-is when it has no ':' instead of raising
        # IndexError on fields[1].
        short_labels.append(fields[1] if len(fields) > 1 else text)
    ax.set_xticklabels(short_labels,
                       rotation=70,
                       rotation_mode='anchor',
                       ha='right')
    ax.set_xlabel('')
    # Raw string: '\D' is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'$\Delta$ % Plus strand')
return prep_order[3] else: return x def rename_sample(xs): sample_dict = defaultdict(int) out_name = [] for x in xs: prep = label_sample(x) sample_dict[prep] += 1 out_name.append('%s %i' % (prep, sample_dict[prep])) return out_name label_ce = color_encoder() label_ce.encoder = {} for label, color in zip([ 'DNase I', 'DNase I + Exo I', 'DNase I + NaOH', 'DNase I + Exo I + NaOH', 'NaOH', 'Untreated', 'Ladder', 'Fragmented', "DNase I - 3'P", 'HEK293' ], [ '#d12604', '#ff96cb', '#964b06', '#f2a157', '#4286f4', 'black', 'grey', '#592782', '#870c47', 'black' ]): label_ce.encoder[label] = color RNA_type = [ 'Antisense', 'Mt', 'Other ncRNA', 'Other sncRNA', 'Protein coding', 'Repeats', 'miRNA', 'rRNA', 'snoRNA', 'tRNA', 'Vault RNA', 'Unannotated', '5/5.8S rRNA', '18/28S rRNA', 'Mt-tRNA'
import numpy as np
import re
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold, LeaveOneOut, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sequencing_tools.viz_tools import color_encoder, okabeito_palette, simpsons_palette
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from operator import itemgetter
import os
import pysam
from collections import Counter

# Colour encoder fitted on the two read-end labels and their colours.
end_ce = color_encoder()
end_ce.fit(["3' end", "5' end"], ['darkgoldenrod', 'purple'])


def positioning(x):
    """Return the last element of ``x``."""
    return x[-1]


def count_to_cpm(count_array):
    """Rescale ``count_array`` so that its values sum to one million.

    ``count_array`` must provide ``.sum()`` (e.g. a numpy array); each value
    is divided by that total with true division and multiplied by 1e6.
    """
    total = count_array.sum()
    return np.true_divide(count_array, total) * 1e6


def get_end(x):
    """Map a name containing 'head' or 'tail' to its end label."""
    # NOTE(review): the visible chunk stops after the 'tail' branch; the
    # definition may continue beyond it -- confirm against the full file.
    if 'head' in x:
        return "5' N+"
    elif 'tail' in x:
        return "3' N-"
from collections import defaultdict
from sequencing_tools.viz_tools import (okabeito_palette,
                                        simpsons_palette,
                                        color_encoder)
from sequencing_tools.io_tools import ReadPicardRNA
from collections import defaultdict
import re
import glob
import os
from plotting_utils import (label_sample, rename_sample,
                            label_ce, rna_type_ce,
                            figure_path, work_path)
from functools import lru_cache

plt.rc('font', **{'family': 'sans-serif', 'sans-serif': 'Arial'})
small_RNA_ce = color_encoder()

# Treatment labels in display order.
label_order = [
    'Untreated',
    'NaOH',
    'WGS-sim',
    'DNase I',
    'DNase I + Exo I',
    "DNase I - 3'P",
]

# Every '*.RNA_Metrics' file under the filtered-bam folder, excluding any
# path that contains 'sense'.
metric_path = work_path + '/cfNA/tgirt_map/merged_bam/filtered_bam'
metrics = glob.glob(metric_path + '/*.RNA_Metrics')
metrics = [path for path in metrics if 'sense' not in path]


def read_metric(metric):
    """Return the strand-percentage rows of one metrics file in long form.

    The first six lines of ``metric`` are skipped and a single data row is
    read; the row is melted to ``variable``/``value`` pairs and only the
    variables containing both ``'TRANSCRIPT_STRAND_'`` and ``'PCT'`` are
    kept.
    """
    melted = pd.melt(pd.read_table(metric, skiprows=6, nrows=1))
    strand_rows = melted[melted.variable.str.contains('TRANSCRIPT_STRAND_')]
    return strand_rows[strand_rows.variable.str.contains('PCT')]
import matplotlib.pyplot as plt sys.path.insert(0, '/stor/home/cdw2854/cfNA/peak_callings') from structural_peaks import PeakAnalyzer, mRNAFilter, GenicIntersect, NameConversion, GeneMapper, TrnaLookAlike from exon_coverage import ExonFilter import dask.dataframe as dd plt.rc('axes', labelsize=20) plt.rc('xtick', labelsize=20) plt.rc('ytick', labelsize=20) plt.rc('font', **{'family': 'sans-serif', 'sans-serif': 'Arial'}) pileup_cutoff = 5 sample_cutoff = 5 project_path = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map' project_path = '/stor/work/Lambowitz/yaojun/Work/cfNA/tgirt_map' peak_path = project_path + '/bed_files/merged_bed/MACS2/annotated' peak_type_ce = color_encoder() peak_type_ce.encoder = { 'mRNA': 'purple', 'Pseudogene': 'darkblue', 'Exon': 'purple', 'Intron': '#fca237', 'Exon-intron': '#7bb73e', 'Within intron': '#f9b109', 'Stem-loop': '#f9b109', 'miRNA': 'darkgreen', 'rRNA': '#15879b', 'Mismapped': '#bcbb76', 'Others': 'black', 'Intergenic': 'black', 'tRNA-lookalike': 'red', 'Full-length intron': '#725001',
import numpy as np
from sequencing_tools.viz_tools import color_encoder, okabeito_palette
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize, OneHotEncoder, LabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA, FactorAnalysis
import matplotlib.patches as mpatches
from sklearn.model_selection import LeaveOneOut, KFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from helper_function import *
import glob
import os

pca_ce = color_encoder()


def pca_color(labs, colors):
    """Pair each label in ``labs`` with the colour at the same position."""
    return dict(zip(labs, colors))


def pca_biplot(train_df, ax):
    # NOTE(review): this definition runs past the end of the visible chunk;
    # the statements below are the visible part, kept unchanged.
    pca = PCA(n_components=3)
    # Drop rows labelled "Zero" or with an empty label.
    train_df = train_df \
        .query('label !="Zero" & label != ""')
    tdf = extract_train_cols(train_df)
    # Standardise the training columns, then project onto three components.
    d = pca.fit_transform(StandardScaler().fit_transform(tdf))
    pca_df = pd.DataFrame(d)
    # Rename the integer columns 0, 1, 2 to PC1, PC2, PC3.
    pca_df.columns = ['PC%i' % (int(col) + 1) for col in pca_df.columns]
    pca_df['label'] = train_df.label.values