def load_data_no_impute_event(etype, event_id, map_event_to_file):
    """Load the 'psi' values of one event as a single-column DataFrame.

    Args:
        etype: Event type; used as key into ``map_event_to_file`` and
            passed on to ``_encode_event_name``.
        event_id: Index of the event, used to index the 'gene_idx'
            dataset and to select one column of the 'psi' dataset.
        map_event_to_file: Mapping from event type to the path of the
            HDF5 file holding that event type's data.

    Returns:
        pandas.DataFrame with one column (named via
        ``_encode_event_name``) and one row per entry of the file's
        'strains' dataset, indexed by ``sf_utils.clean_strain(...)``.

    Raises:
        KeyError: if ``etype`` is not a key of ``map_event_to_file``.
    """
    path = map_event_to_file[etype]

    # Read everything that is needed into memory inside the context manager
    # so the HDF5 handle is closed again even if one of the reads fails.
    with h5py.File(path, 'r') as data:
        gene_idx = data['gene_idx'][event_id].astype('int')
        ensg_idx = data['gene_names'][gene_idx]
        psi = data['psi'][:, event_id]
        raw_strains = data['strains'][:]

    lookup = names.get_lookup_complete()
    ensg_name = names.get_ID(ensg_idx, lookup=lookup)

    # Dashes in the gene name are replaced by underscores before encoding.
    columns = [_encode_event_name(re.sub(r'-', '_', ensg_name), etype, event_id)]
    strains = sf_utils.clean_strain(raw_strains)

    return pd.DataFrame(psi, index=strains, columns=columns)
def get_sf_interest(v=False):
    """Translate a fixed list of gene names into the IDs used downstream.

    Each name is passed to ``names.get_ID(..., which='ensembl')``; only
    results that start with 'ENSG' (compared case-insensitively) are kept.

    Args:
        v: Verbose flag. When true, a warning is printed for every gene
            whose translated ID is rejected, and the final mapping is
            shown via ``display``.

    Returns:
        dict mapping gene name -> translated ID for all accepted genes.
    """
    candidate_symbols = [
        'PKM', 'MAX', 'NUMB', 'MKNK2', 'BIN1',
        'RPS6KB1', 'BCL2L1', 'APC', 'PTEN', 'KLF6',
        'CD44', 'CCK2', 'GHRH', 'CDKN2A', 'KIT',
    ]
    lookup = names.get_lookup_complete()

    sf_interest = dict()
    for symbol in candidate_symbols:
        ensg = names.get_ID(symbol, which='ensembl', lookup=lookup)
        if ensg.upper().startswith('ENSG'):
            sf_interest[symbol] = ensg
        elif v:
            # Translation did not yield an ENSG identifier: report and drop it.
            print("WARNING: %s -> %s is unexpected and ignored" % (symbol, ensg))

    if v:
        print("Genes of interest:")
        display(sf_interest)
    return sf_interest
# Script preamble: imports, lookup table, and input/output locations.
import sys
import os
import glob
import scipy as sp
import re
import matplotlib
# Select the non-interactive 'Agg' backend; this is done before pyplot is
# imported below.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.markers as markers
from sklearn.decomposition import PCA
# Extend the module search path (relative to the working directory) so the
# project-local `names` module can be imported.
sys.path.append('../icgc_utils')
import names
# get lookup table of gene names
lookup = names.get_lookup_complete()
# NOTE(review): CONF is not used in the visible part of the script --
# presumably a confidence level used further down; confirm.
CONF = 2
# Extend the search path again so the project-local `paths` module resolves.
sys.path.append('..')
from paths import BASEDIR, BASEDIR_AS
# Input/output locations derived from the project base directories.
basedir = os.path.join(BASEDIR_AS, 'alternative_splicing')
metatable = os.path.join(
    BASEDIR, 'orig_data/metadata/per_aliquot_v2/rnaseq_metadata.histo.tsv')
plotdir = os.path.join(basedir, 'plots', 'psi_deviation_histo')
# Create the plot directory (including parents) if it does not exist yet.
if not os.path.exists(plotdir):
    os.makedirs(plotdir)
### prep metadata
metadata = []
# Script preamble: fixed random seed, imports, and input/output locations.
# NOTE(review): `npr`, `sys`, `os` and `sp` are used below but are not
# imported in the visible part -- presumably imported above; confirm.
npr.seed(23)
import pickle
import re
import intervaltree as it
import h5py
import scipy.stats as spst
# NOTE(review): absolute, user-specific path; the import below only resolves
# on machines where this checkout exists.
sys.path.append('/cluster/home/akahles/git/software/spladder/python')
import modules.utils as mu
# Extend the module search path (relative to the working directory) for the
# project-local helper modules imported next.
sys.path.append('../icgc_anno')
import translate_metadata as tm
sys.path.append('../icgc_utils')
import names as un
lookup = un.get_lookup_complete()
sys.path.append('..')
from paths import BASEDIR,BASEDIR_AS
# Event type handled by this script; it is substituted into the file names
# below.
event_type = 'exon_skip'
event_in = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.pickle' % event_type)
candidate_out = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.exonize_candidates.pickle' % event_type)
VARIANTS = os.path.join(BASEDIR, 'qtl_analysis/variants/mccalls/October_2016_whitelist_2583.snv_mnv_indel.sorted.sparse.hdf5')
# Tab-delimited text file loaded as an array of strings.
coding_genes = sp.loadtxt(os.path.join(BASEDIR, 'annotation/gencode.v19.annotation.hs37d5_chr.gtf.coding_genes.txt'), delimiter='\t', dtype='str')
# NOTE(review): read_thresh is not used in the visible part -- presumably a
# minimum read count applied further down; confirm.
read_thresh = 3
### prepare bam file dict
bam_dict = dict()
# Iterate over the lines of the merged sample list; the loop body lies
# outside the visible part of the script.
for line in open(os.path.join(BASEDIR_AS, 'alternative_splicing', 'sample_list_merged.txt'), 'r'):
# Output directory for the outlier results; created on first use.
outdir = os.path.join(paths.basedir_as, 'outliers')
if not os.path.exists(outdir):
    os.makedirs(outdir)

### get TCGA type dictionary
(ct_dict, is_tumor_dict) = utils.get_ct_dict_metatable(paths.metadata, style='pancan_rerun18')

### walk through all events and check whether a subset of samples consists of extreme outliers
event_types = ['alt_3prime', 'alt_5prime', 'intron_retention', 'exon_skip', 'mutex_exons']
outliers = dict()
# NOTE(review): the three thresholds below are not used in the visible part
# of the script; their exact meaning should be confirmed against the code
# further down.
out_thresh = 3
out_fact = 10
dpsi = 0.4
lookup = nm.get_lookup_complete()
for et in event_types:
    outlier_cnt = 0
    # Per-event-type count files of the two cohorts, opened read-only.
    IN = h5py.File(os.path.join(paths.basedir_as, 'merge_graphs_%s_C%i.counts.hdf5' % (et, CONF)), 'r')
    IN_GT = h5py.File(os.path.join(paths.basedir_as_gtex, 'merge_graphs_%s_C%i.counts.hdf5' % (et, CONF)), 'r')
    # Keep only the second dot-separated field of every strain name.
    strains = sp.array([x.split('.')[1] for x in IN['strains'][:]])

    ### apply whitelist
    if use_wl:
        whitelist = sp.loadtxt(paths.whitelist, delimiter='\t', dtype='str')
        whitelist = sp.array([x.split('.')[1] for x in whitelist])
        # Boolean mask selecting the whitelisted strains.
        widx = sp.in1d(strains, whitelist)
        strains = strains[widx]
    else:
        # No whitelist: select every strain by position. arange() needs the
        # number of strains, not the array of strain names itself.
        widx = sp.arange(strains.shape[0])