Esempio n. 1
0
def load_data_no_impute_event(etype, event_id, map_event_to_file): # , sf_interest):
    """Load the PSI values of a single event as a one-column DataFrame.

    Parameters
    ----------
    etype
        Key into ``map_event_to_file``; also forwarded to
        ``_encode_event_name`` when the column label is built.
    event_id
        Index of the event in the ``gene_idx`` dataset and along the second
        axis of the ``psi`` dataset.
    map_event_to_file
        Mapping from event type to the path of the HDF5 file to read.

    Returns
    -------
    pandas.DataFrame
        One column (labelled by ``_encode_event_name``) holding
        ``psi[:, event_id]``, indexed by the cleaned strain identifiers.
    """
    path = map_event_to_file[etype]

    # Pull everything needed into memory while the file is open; the context
    # manager guarantees the HDF5 handle is released even if a read fails.
    with h5py.File(path, 'r') as data:
        gene_idx = data['gene_idx'][event_id].astype('int')
        ensg_idx = data['gene_names'][gene_idx]
        psi = data['psi'][:, event_id]
        raw_strains = data['strains'][:]

    lookup = names.get_lookup_complete()
    ensg_name = names.get_ID(ensg_idx, lookup=lookup)
    # '-' is replaced by '_' in the name before it is encoded into the label
    columns = [_encode_event_name(re.sub(r'-', '_', ensg_name), etype, event_id)]
    strains = sf_utils.clean_strain(raw_strains)
    return pd.DataFrame(psi, index=strains, columns=columns)
Esempio n. 2
0
def get_sf_interest(v=False):
    """Map a fixed list of gene names to their Ensembl identifiers.

    Each name is resolved through ``names.get_ID(..., which='ensembl')``.
    Results that do not start with ``ENSG`` (case-insensitive) are skipped.

    Parameters
    ----------
    v
        Verbose flag. When true, a warning is printed for every skipped gene
        and the final mapping is shown via ``display``.

    Returns
    -------
    dict
        ``{gene_name: ensembl_id}`` for every gene that resolved to an
        ``ENSG...`` identifier.
    """
    gene_name_list = ['PKM', 'MAX', 'NUMB', 'MKNK2', 'BIN1',
        'RPS6KB1', 'BCL2L1', 'APC', 'PTEN',
        'KLF6', 'CD44', 'CCK2', 'GHRH',
        'CDKN2A', 'KIT']
    sf_interest = dict()
    lookup = names.get_lookup_complete()
    for gene_name in gene_name_list:
        gene_ensg = names.get_ID(gene_name, which='ensembl', lookup=lookup)
        if not gene_ensg.upper().startswith('ENSG'):
            if v:
                # single-argument print(...) behaves the same on Python 2 and 3
                print("WARNING: %s -> %s is unexpected and ignored" % (gene_name, gene_ensg))
            continue
        sf_interest[gene_name] = gene_ensg
    if v:
        print("Genes of interest:")
        # NOTE(review): `display` is not defined in the visible code
        # (presumably IPython's display) -- confirm it is in scope.
        display(sf_interest)
    return sf_interest
Esempio n. 3
0
# Script setup: imports, gene-name lookup table and input/output locations.
import sys
import os
import glob
import scipy as sp
import re

import matplotlib
# select the Agg backend; this is done before pyplot is imported
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.markers as markers
from sklearn.decomposition import PCA

# extend the module search path so the local `names` module can be imported
sys.path.append('../icgc_utils')
import names
# get lookup table of gene names
lookup = names.get_lookup_complete()

# integer substituted into the 'C%i' part of the counts file names
CONF = 2

# extend the module search path so the local `paths` module can be imported
sys.path.append('..')
from paths import BASEDIR, BASEDIR_AS

# locations derived from the two base directories
basedir = os.path.join(BASEDIR_AS, 'alternative_splicing')
metatable = os.path.join(
    BASEDIR, 'orig_data/metadata/per_aliquot_v2/rnaseq_metadata.histo.tsv')
plotdir = os.path.join(basedir, 'plots', 'psi_deviation_histo')
# create the plot directory if it does not exist yet
if not os.path.exists(plotdir):
    os.makedirs(plotdir)

### prep metadata
metadata = []
# NOTE(review): `npr` is not imported in the visible code (presumably
# numpy.random) -- confirm where it comes from.
npr.seed(23)
# Script setup: imports, lookup table, and input/output paths for one event type.
import pickle
import re
import intervaltree as it
import h5py
import scipy.stats as spst

# NOTE(review): absolute, user-specific path; the import below depends on it.
sys.path.append('/cluster/home/akahles/git/software/spladder/python')
import modules.utils as mu

sys.path.append('../icgc_anno')
import translate_metadata as tm

sys.path.append('../icgc_utils')
import names as un
# lookup table obtained from the `names` module (imported here as `un`)
lookup = un.get_lookup_complete()

sys.path.append('..')
from paths import BASEDIR,BASEDIR_AS

# event type that is substituted into the pickle file names below
event_type = 'exon_skip'
event_in = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.pickle' % event_type)
candidate_out = os.path.join(BASEDIR_AS, 'alternative_splicing', 'merge_graphs_%s_C2.exonize_candidates.pickle' % event_type)

VARIANTS = os.path.join(BASEDIR, 'qtl_analysis/variants/mccalls/October_2016_whitelist_2583.snv_mnv_indel.sorted.sparse.hdf5')
# tab-delimited text file, loaded with string dtype
coding_genes = sp.loadtxt(os.path.join(BASEDIR, 'annotation/gencode.v19.annotation.hs37d5_chr.gtf.coding_genes.txt'), delimiter='\t', dtype='str')     
read_thresh = 3

### prepare bam file dict
bam_dict = dict()
for line in open(os.path.join(BASEDIR_AS, 'alternative_splicing', 'sample_list_merged.txt'), 'r'):
### output directory for the outlier analysis, created if it is missing
outdir = os.path.join(paths.basedir_as, 'outliers')
if not os.path.exists(outdir):
    os.makedirs(outdir)

### two dictionaries derived from the metadata table
ct_dict, is_tumor_dict = utils.get_ct_dict_metatable(paths.metadata, style='pancan_rerun18')

### event types that are processed one after the other
event_types = [
    'alt_3prime',
    'alt_5prime',
    'intron_retention',
    'exon_skip',
    'mutex_exons',
]

### result container and numeric settings
outliers = {}
out_thresh, out_fact, dpsi = 3, 10, 0.4

lookup = nm.get_lookup_complete()

for et in event_types:
    outlier_cnt = 0
    # open the count files of this event type under paths.basedir_as and
    # paths.basedir_as_gtex (read-only)
    IN = h5py.File(os.path.join(paths.basedir_as, 'merge_graphs_%s_C%i.counts.hdf5' % (et, CONF)), 'r')
    IN_GT = h5py.File(os.path.join(paths.basedir_as_gtex, 'merge_graphs_%s_C%i.counts.hdf5' % (et, CONF)), 'r')
    # keep the second '.'-separated field of every strain identifier
    strains = sp.array([x.split('.')[1] for x in IN['strains'][:]])

    ### apply whitelist
    if use_wl:
        whitelist = sp.loadtxt(paths.whitelist, delimiter='\t', dtype='str')
        whitelist = sp.array([x.split('.')[1] for x in whitelist])
        # boolean mask over strains: True where the strain is in the whitelist
        widx = sp.in1d(strains, whitelist)
        strains = strains[widx]
    else:
        # no whitelist: select every strain by position. arange needs the
        # number of strains, not the array of strain names itself.
        widx = sp.arange(strains.shape[0])