Example #1
0
def load_data_no_impute_event(etype, event_id, map_event_to_file):
    """Load the PSI values of one splicing event as a one-column DataFrame.

    Parameters
    ----------
    etype
        Event type; used as key into ``map_event_to_file`` and passed on to
        ``_encode_event_name`` for the column label.
    event_id
        Index of the event; selects the entry of ``gene_idx`` and the column
        of ``psi`` in the HDF5 file.
    map_event_to_file
        Mapping from event type to the path of the HDF5 file to read.

    Returns
    -------
    pandas.DataFrame
        A single column of PSI values, labelled via ``_encode_event_name``
        and indexed by the strain ids after ``sf_utils.clean_strain``.
    """
    path = map_event_to_file[etype]

    # Read everything we need inside a context manager so the HDF5 handle is
    # released even when one of the lookups raises.
    with h5py.File(path, 'r') as data:
        gene_idx = data['gene_idx'][event_id].astype('int')
        ensg_idx = data['gene_names'][gene_idx]
        psi = data['psi'][:, event_id]
        raw_strains = data['strains'][:]

    lookup = names.get_lookup_complete()
    ensg_name = names.get_ID(ensg_idx, lookup=lookup)

    # Dashes in the gene name are replaced by underscores before the name is
    # encoded into the column label.
    columns = [_encode_event_name(re.sub(r'-', '_', ensg_name), etype, event_id)]
    strains = sf_utils.clean_strain(raw_strains)
    return pd.DataFrame(psi, index=strains, columns=columns)
Example #2
0
def get_sf_interest(v=False):
    """Map a fixed list of gene names to their Ensembl gene ids.

    Each name is translated with ``names.get_ID(..., which='ensembl')``.
    Names whose translation does not start with ``ENSG`` (case-insensitive)
    are left out of the result.

    Parameters
    ----------
    v
        Verbose flag. When true, a warning is printed for every skipped
        gene and the final mapping is shown via ``display``.

    Returns
    -------
    dict
        Gene name -> Ensembl gene id for every gene that could be mapped.
    """
    gene_name_list = ['PKM', 'MAX', 'NUMB', 'MKNK2', 'BIN1',
        'RPS6KB1', 'BCL2L1', 'APC', 'PTEN',
        'KLF6', 'CD44', 'CCK2', 'GHRH',
        'CDKN2A', 'KIT']
    sf_interest = dict()
    lookup = names.get_lookup_complete()
    for gene_name in gene_name_list:
        gene_ensg = names.get_ID(gene_name, which='ensembl', lookup=lookup)
        if not gene_ensg.upper().startswith('ENSG'):
            if v:
                # Single-argument print call: same output on Python 2 and 3.
                print("WARNING: %s -> %s is unexpected and ignored" % (gene_name, gene_ensg))
            continue
        sf_interest[gene_name] = gene_ensg
    if v:
        print("Genes of interest:")
        display(sf_interest)
    return sf_interest
Example #3
0
        ### bring ids and gene labels into the order given by s_idx
        merged_ids = merged_ids[s_idx, :]
        genes = genes[s_idx]

        print("Writing output files")
        ### binarized table: entries above 4.5 become '1', all others '0';
        ### a header row (strains) and a leading 'gene_id' column are attached
        merged_bin = (merged > 4.5).astype('int').astype('str')
        merged_bin = sp.r_[strains[sp.newaxis, :], merged_bin]
        merged_bin = sp.c_[sp.r_[['gene_id'], genes], merged_bin]

        ### per-row median table; after both c_ steps the columns are
        ### gene_name, gene_id, median_dev. The gene id is cut at the first
        ### '.' (presumably the version suffix - TODO confirm)
        merged_median = sp.median(merged, axis=1).astype('str')
        merged_median = sp.append('median_dev', merged_median)
        merged_median = sp.c_[
            sp.append('gene_id', sp.array([x.split('.')[0] for x in genes])),
            merged_median]
        merged_median = sp.c_[sp.append(
            'gene_name',
            sp.array([names.get_ID(x.split('.')[0], lookup) for x in genes])),
                              merged_median]

        ### string version of the full matrix with the same header row / id column
        merged_str = merged.astype('str')
        merged_str = sp.r_[strains[sp.newaxis, :], merged_str]
        merged_str = sp.c_[sp.r_[['gene_id'], genes], merged_str]

        ### same layout for the id matrix
        merged_ids_str = merged_ids.astype('str')
        merged_ids_str = sp.r_[strains[sp.newaxis, :], merged_ids_str]
        merged_ids_str = sp.c_[sp.r_[['gene_id'], genes], merged_ids_str]

        ### an existing output file is never overwritten
        if not os.path.exists(os.path.join(basedir, outfile_pat[p])):
            sp.savetxt(os.path.join(basedir, outfile_pat[p]),
                       merged_str,
                       fmt='%s',
                       delimiter='\t')
# Convert the collected per-event values to arrays so they can be indexed
# with an index array below.
alt_projects = sp.array(alt_projects)
var_pos = sp.array(var_pos)
alt_files = sp.array(alt_files)
ref_files = sp.array(ref_files)

# Positions within keep_idx whose event index also occurs in conf_idx;
# every per-event array is cut down to those positions.
k_idx = sp.where(sp.in1d(keep_idx, conf_idx))[0]
keep_idx = keep_idx[k_idx]
delta_psi = delta_psi[k_idx]
alt_donors = alt_donors[k_idx]
ref_donors = ref_donors[k_idx]
alt_projects = alt_projects[k_idx]
var_pos = var_pos[k_idx]
alt_files = alt_files[k_idx]
ref_files = ref_files[k_idx]

# Gather the annotation of the remaining events from EV. Event-level
# datasets are indexed by event index, gene-level datasets (strand, name,
# chromosome) by the gene index of the event.
event_pos = sp.array([EV['event_pos'][i, :] for i in keep_idx])
gene_idx = sp.array([EV['gene_idx'][i] for i in keep_idx], dtype='int')
gene_strand = sp.array([EV['gene_strand'][i] for i in gene_idx])
gene_ids = sp.array([EV['gene_names'][i] for i in gene_idx])
gene_names = sp.array([un.get_ID(_, lookup=lookup) for _ in gene_ids])
is_coding = sp.array(['coding' if _ in coding_genes else 'non-coding' for _ in gene_ids])
event_chr = sp.array([EV['gene_chr'][i] for i in gene_idx])

# Rows are written in order of decreasing delta_psi.
s_idx = sp.argsort(delta_psi)[::-1]
#           0        1         2          3--8       9         10          11         12          13          14            15         16         17           18
res = sp.c_[var_pos, keep_idx, event_chr, event_pos, gene_ids, gene_names, delta_psi, alt_donors, ref_donors, alt_projects, alt_files, ref_files, gene_strand, is_coding][s_idx, :]
sp.savetxt(os.path.join(BASEDIR_AS, 'alternative_splicing', 'exonization_candidates_C2.txt'), res, fmt='%s', delimiter='\t')
EV.close()


    ### strain ids are the second '.'-separated field of the stored names
    strains = sp.array([x.split('.')[1] for x in IN['strains'][:]])

    ### apply whitelist
    if use_wl:
        whitelist = sp.loadtxt(paths.whitelist, delimiter='\t', dtype='str')
        whitelist = sp.array([x.split('.')[1] for x in whitelist])
        widx = sp.in1d(strains, whitelist)
        strains = strains[widx]
    else:
        ### keep every sample: arange needs the number of strains, not the
        ### string array itself (which makes arange raise)
        widx = sp.arange(strains.shape[0])

    ctypes = sp.array([ct_dict[x] for x in strains])
    is_tumor = sp.array([is_tumor_dict[x] for x in strains])
    ctypes_u, ctypes_cnt = sp.unique(ctypes, return_counts=True)
    gene_ids = IN['gene_names'][:]
    gene_names = sp.array([nm.get_ID(x, lookup=lookup) for x in gene_ids])
    gene_idx = IN['gene_idx'][:].astype('int')

    ### only keep cancer types where we have at least 100 samples
    k_idx = sp.where(ctypes_cnt >= 100)[0]
    ctypes_u = ctypes_u[k_idx]

    ### one (initially empty) outlier list per retained cancer type
    outliers[et] = dict([(x, []) for x in ctypes_u])
    chunks = IN['psi'].chunks

    ### walk over the psi matrix in blocks of chunks[1] columns
    for cc, c in enumerate(range(0, IN['psi'].shape[1], chunks[1])):
        sys.stdout.write('%i/%i - outliers: %i\n' % (cc, IN['psi'].shape[1] / chunks[1], outlier_cnt))
        sys.stdout.flush()
        cidx = sp.arange(c, min(c + chunks[1], IN['psi'].shape[1]))
        ### read the column block first, then restrict the rows to widx
        tmp = IN['psi'][:, cidx][widx, :]
        tmp_gt = IN_GT['psi'][:, cidx]
Example #6
0
    ### one grid row per alt donor plus one extra row; the extra (last) row
    ### gets height ratio 1, all others 3. The visible loop only fills the
    ### donor rows.
    rows = alt_donors.shape[0] + 1
    height_ratios = [3 for _ in range(rows)]
    height_ratios[-1] = 1
    gs = gridspec.GridSpec(rows, 1, height_ratios=height_ratios)
    fig = plt.figure(figsize=(16, 3 * rows), dpi=200)
    #fig.suptitle('%s in %s (Variant: %s:%s)' % (results[i, 6], results[i, 7],results[i, 0],  results[i, 2]), fontsize=12, y=1.03)

    for d, donor in enumerate(alt_donors):

        ### plot coverage profile
        ax = fig.add_subplot(gs[d, 0])
        ### handles returned by cov_from_bam and their labels, collected per panel
        cax = []
        labels = []
        #ax.set_title('%s - Gene: %s (%s) - SNV: %s - in frame: %s' % (results[i, 14], un.get_ID(results[i, 9], lookup=lookup), results[i, 17], results[i, 0], in_frame), fontsize=10)
        ### NOTE(review): the commented-out title above reads the gene id from
        ### column 9, the active one from column 8 - confirm which is intended
        ax.set_title('Gene: %s -in frame: %s' %
                     (un.get_ID(results[i, 8], lookup=lookup), in_frame),
                     fontsize=10)
        ### alt donor drawn in red over the span of event_pos; log scaling is
        ### switched on by the '--log' command line flag
        cax.append(
            cv.cov_from_bam(event_chr,
                            event_pos.min(),
                            event_pos.max(), [files_alt[d]],
                            color_cov='r',
                            ax=ax,
                            intron_cnt=True,
                            color_intron_edge='r',
                            log=('--log' in sys.argv),
                            intron_filter=intron_filter,
                            return_legend_handle=True,
                            label='Alt: %s' % donor))
        labels.append('Alt: %s' % donor)
        if files_ref[d] != 'NA':
Example #7
0
    ### chromosome of the event is taken from column 2 of the result row
    event_chr = str(results[i, 2])

    ### one grid row per alt donor plus one extra row; the extra (last) row
    ### gets height ratio 1, all others 3
    rows = alt_donors.shape[0] + 1
    height_ratios = [3 for _ in range(rows)]
    height_ratios[-1] = 1
    gs = gridspec.GridSpec(rows, 1, height_ratios=height_ratios)
    fig = plt.figure(figsize = (8, 3 * rows), dpi=200)
    #fig.suptitle('%s in %s (Variant: %s:%s)' % (results[i, 6], results[i, 7],results[i, 0],  results[i, 2]), fontsize=12, y=1.03)
        
    for d, donor in enumerate(alt_donors):

        ### plot coverage profile
        ax = fig.add_subplot(gs[d, 0])
        ### handles returned by cov_from_bam and their labels, used for the legend
        cax = []
        labels = []
        ax.set_title('%s - Gene: %s (%s) - SNV: %s - in frame: %s' % (results[i, 14], un.get_ID(results[i, 9], lookup=lookup), results[i, 17], results[i, 0], in_frame), fontsize=10)
        ### alt donor in red
        cax.append(cv.cov_from_bam(event_chr, event_pos.min(), event_pos.max(), [files_alt[d]], color_cov='r', ax=ax, intron_cnt=True, color_intron_edge='r', log=('--log' in sys.argv), intron_filter=intron_filter, return_legend_handle=True, label='Alt: %s' % donor))
        labels.append('Alt: %s' % donor)
        ### the matching reference donor (color '0.0') is only drawn when a file is available
        if files_ref[d] != 'NA':
            cax.append(cv.cov_from_bam(event_chr, event_pos.min(), event_pos.max(), [files_ref[d]], color_cov='0.0', ax=ax, intron_cnt=True, color_intron_edge='0.0', log=('--log' in sys.argv), intron_filter=intron_filter, return_legend_handle=True, label='Ref: %s' % ref_donors[d])) 
            labels.append('Ref: %s' % ref_donors[d])
        plt.legend(cax, labels, fontsize=10)
        ### mark every variant position with a small blue dot at y=0
        for vp in var_pos:
            ax.plot(vp, 0, 'bo', markersize=1)
        ylim = ax.get_ylim()
        xlim = ax.get_xlim()
        xspan = xlim[1] - xlim[0]
        yspan = ylim[1] - ylim[0]
        ### arrow at each variant position, starting at 0.2 * yspan with a
        ### vertical offset of -0.15 * yspan; head size scales with the axis spans
        for vp in var_pos:
            ax.arrow(vp, yspan * 0.2, 0, yspan * -0.15, head_width=0.01*xspan, head_length=0.01*yspan, fc='b') 
        ### x limits of the last drawn panel are kept in x_range
        x_range = ax.get_xlim()
    ### one grid row per alt donor plus one extra row; the extra (last) row
    ### gets height ratio 1, all others 3. The visible loop only fills the
    ### donor rows.
    rows = alt_donors.shape[0] + 1
    height_ratios = [3 for _ in range(rows)]
    height_ratios[-1] = 1
    gs = gridspec.GridSpec(rows, 1, height_ratios=height_ratios)
    fig = plt.figure(figsize=(16, 3 * rows), dpi=200)
    #fig.suptitle('%s in %s (Variant: %s:%s)' % (results[i, 6], results[i, 7],results[i, 0],  results[i, 2]), fontsize=12, y=1.03)

    for d, donor in enumerate(alt_donors):

        ### plot coverage profile
        ax = fig.add_subplot(gs[d, 0])
        ### handles returned by cov_from_bam and their labels, collected per panel
        cax = []
        labels = []
        ### title is built from columns 14, 9 (translated via un.get_ID), 17
        ### and 0 of the result row plus the in_frame flag
        ax.set_title('%s - Gene: %s (%s) - SNV: %s - in frame: %s' %
                     (results[i, 14], un.get_ID(results[i, 9], lookup=lookup),
                      results[i, 17], results[i, 0], in_frame),
                     fontsize=10)
        ### alt donor drawn in red over the span of event_pos; log scaling is
        ### switched on by the '--log' command line flag
        cax.append(
            cv.cov_from_bam(event_chr,
                            event_pos.min(),
                            event_pos.max(), [files_alt[d]],
                            color_cov='r',
                            ax=ax,
                            intron_cnt=True,
                            color_intron_edge='r',
                            log=('--log' in sys.argv),
                            intron_filter=intron_filter,
                            return_legend_handle=True,
                            label='Alt: %s' % donor))
        labels.append('Alt: %s' % donor)