def quantify_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Quantify the two isoforms of a single exon-skip event.

    Parameters
    ----------
    event : object holding the exon coordinates of the event. The native
        layout reads ``event.exons2`` (rows: exon before, skipped exon,
        exon after); the MATLAB layout reads ``event.exon_pre``,
        ``event.exon`` and ``event.exon_aft``.
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    CFG : dict; only ``CFG['is_matlab']`` is read.

    Returns
    -------
    cov : float array of length 2.
        ``cov[0]`` is the length-weighted mean coverage of the skipped exon's
        segments plus the counts of the two junctions flanking it;
        ``cov[1]`` is the count of the junction that skips the exon.
        A junction without an entry in ``counts_edges`` contributes 0.
    """
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        seg_shape = segs[0, 2].shape
        # edge ids in this layout are column-major and shifted by one
        order = 'F'
        offset = 1

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg[0, 0][0, :] == event.exon_pre[0]) & (sg[0, 0][1, :] == event.exon_pre[1]))[0]
        idx_exon = sp.where((sg[0, 0][0, :] == event.exon[0]) & (sg[0, 0][1, :] == event.exon[1]))[0]
        idx_exon_aft = sp.where((sg[0, 0][0, :] == event.exon_aft[0]) & (sg[0, 0][1, :] == event.exon_aft[1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs[0, 1][idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs[0, 1][idx_exon_aft, :])[1])
        seg_exon = sp.sort(sp.where(segs[0, 1][idx_exon, :])[1])
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        seg_shape = segs.seg_edges.shape
        order = 'C'
        offset = 0

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
        idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
        idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0]) & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
        seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    def _edge_count(seg_from, seg_to):
        # Look the junction up by its flattened id; use the first matching row
        # and fall back to 0 when the edge was never counted (same guard as
        # quantify_mult_exon_skip / quantify_mutex_exons).
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        hits = sp.where(counts_edges[:, 0] == edge_id)[0]
        if hits.shape[0] > 0:
            return counts_edges[hits[0], 1]
        return 0

    # get inner exon cov
    cov[0] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(seg_lens[seg_exon])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    cov[0] += _edge_count(seg_exon_pre[-1], seg_exon[0])
    # exon_exon_aft_conf
    cov[0] += _edge_count(seg_exon[-1], seg_exon_aft[0])
    # exon_pre_exon_aft_conf
    cov[1] = _edge_count(seg_exon_pre[-1], seg_exon_aft[0])

    return cov
def quantify_mult_exon_skip(event, gene, counts_segments, counts_edges):
    """Quantify the two isoforms of a multiple-exon-skip event.

    ``event.exons2`` lists the exon before, every skipped exon, and the exon
    after (first row, inner rows, last row).

    Parameters
    ----------
    event : object with an ``exons2`` coordinate array (one row per exon).
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.

    Returns
    -------
    cov : float array of length 2.
        ``cov[0]`` is the length-weighted mean coverage over all segments of
        the skipped exons plus the counts of every junction along the
        inclusion path; ``cov[1]`` is the count of the junction that skips
        all inner exons. Junctions missing from ``counts_edges`` add 0.
    """
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    # ravel_multi_index needs the full (rows, cols) shape of the edge matrix;
    # passing only shape[0] makes every two-index lookup raise.
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[-1, 0]) & (sg.vertices[1, :] == event.exons2[-1, 1]))[0]
    seg_exons = []
    for i in range(1, event.exons2.shape[0] - 1):
        tmp = sp.where((sg.vertices[0, :] == event.exons2[i, 0]) & (sg.vertices[1, :] == event.exons2[i, 1]))[0]
        seg_exons.append(sp.where(segs.seg_match[tmp, :])[1])

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exons_u = sp.sort(sp.unique([x for sublist in seg_exons for x in sublist]))

    def _edge_count(seg_from, seg_to):
        # first matching row of counts_edges, or 0 if the edge is not listed
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        hits = sp.where(counts_edges[:, 0] == edge_id)[0]
        if hits.shape[0] > 0:
            return counts_edges[hits[0], 1]
        return 0

    ### inner exons_cov
    cov[0] = sp.sum(counts_segments[seg_exons_u] * seg_lens[seg_exons_u]) / sp.sum(seg_lens[seg_exons_u])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    cov[0] += _edge_count(seg_exon_pre[-1], seg_exons[0][0])
    # exon_exon_aft_conf
    cov[0] += _edge_count(seg_exons[-1][-1], seg_exon_aft[0])
    # exon_pre_exon_aft_conf
    cov[1] = _edge_count(seg_exon_pre[-1], seg_exon_aft[0])
    # sum_inner_exon_conf
    for i in range(len(seg_exons) - 1):
        cov[0] += _edge_count(seg_exons[i][-1], seg_exons[i + 1][0])

    return cov
def coordinates_to_voxel_idx(coords_xyz, masker):
    """Map coordinates to their positions inside ``masker.in_mask``.

    Parameters
    ----------
    coords_xyz : 2-D array with one coordinate per column (a row of ones is
        appended to make the columns homogeneous).
    masker : object providing ``volume.get_affine()``, ``dims`` (the shape
        used to flatten voxel indices) and ``in_mask`` (array of flattened
        voxel indices).

    Returns
    -------
    voxel_idcs : int64 array of shape ``(1, n_coordinates)``; entry ``i`` is
        the position in ``masker.in_mask`` whose value equals the flattened
        voxel index of coordinate ``i``.

    Raises
    ------
    ValueError
        If a coordinate's voxel does not occur in ``masker.in_mask``. The
        previous code tried to store NaN in the int64 result for this case,
        which also raised ValueError (NaN has no integer representation);
        the exception type is kept and the message now says what went wrong.
    """
    # transform to homogeneous coordinates
    coords_h_xyz = sp.append(coords_xyz, sp.ones([1, coords_xyz.shape[1]]), axis=0)

    # apply inverse affine transformation to get homogeneous coordinates in voxel space
    inv_transf = sp.linalg.inv(masker.volume.get_affine())
    coords_h_voxel_space = sp.rint(inv_transf.dot(coords_h_xyz)).astype(int)

    # remove homogeneous dimension
    coords_voxel_space = coords_h_voxel_space[0:-1, :]

    # convert coordinates to idcs in a flattened voxel space
    flattened_idcs = sp.ravel_multi_index(coords_voxel_space, masker.dims)

    # check if there is any study data for the flattened idcs
    voxel_idcs = sp.zeros((1, len(flattened_idcs)), dtype=sp.int64)
    for i, flat_idx in enumerate(flattened_idcs):
        # single lookup per coordinate (flatnonzero == nonzero of the raveled mask)
        idcs = sp.flatnonzero(masker.in_mask == flat_idx)
        if len(idcs) == 0:
            raise ValueError('coordinate %i maps to flattened voxel %i, which is not part of the mask'
                             % (i, flat_idx))
        voxel_idcs[0, i] = idcs[0]

    return voxel_idcs
def quantify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos):
    """Quantify an intron-retention event.

    Parameters
    ----------
    event : object whose ``exons1`` array holds the two exons flanking the
        intron (row 0 and row 1).
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    counts_seg_pos : accepted for interface compatibility; not read here.

    Returns
    -------
    cov : float array of length 2.
        ``cov[0]`` is the length-weighted mean coverage of the segments that
        lie between the two exons; ``cov[1]`` is the count of the junction
        joining the exons, or 0 if ``counts_edges`` has no entry for it.

    Raises
    ------
    AssertionError
        If no segment lies between the two exons.
    """
    cov = sp.zeros((2, ), dtype='float')
    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])
    # everything from the first exon's first segment up to (excluding) the
    # second exon's last segment, minus the exons' own segments
    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])

    seg_intron = sp.setdiff1d(seg_all, seg_exon1)
    seg_intron = sp.setdiff1d(seg_intron, seg_exon2)
    assert(seg_intron.shape[0] > 0)

    ### compute exon coverages as mean of position wise coverage
    # intron_cov
    cov[0] = sp.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon1[-1], seg_exon2[0]], seg_shape, order=order) + offset)[0]
    # store a scalar and tolerate a junction that was never counted
    if idx.shape[0] > 0:
        cov[1] = counts_edges[idx[0], 1]

    return cov
def quantify_mutex_exons(event, gene, counts_segments, counts_edges):
    """Quantify the two isoforms of a mutually-exclusive-exons event.

    ``event.exons1`` and ``event.exons2`` share their first and last rows
    (the flanking exons); row 1 of each holds that isoform's middle exon.

    Parameters
    ----------
    event : object with ``exons1`` and ``exons2`` coordinate arrays.
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.

    Returns
    -------
    cov : float array of length 2.
        ``cov[k]`` is the length-weighted mean coverage of isoform ``k``'s
        middle exon plus the counts of the two junctions connecting it to
        the flanking exons. Junctions missing from ``counts_edges`` add 0.
    """
    # the result vector was never created before being written to
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    # full (rows, cols) shape is required to flatten a (from, to) edge index
    seg_shape = segs.seg_edges.shape
    order = 'C'
    offset = 0

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons1[-1, 0]) & (sg.vertices[1, :] == event.exons1[-1, 1]))[0]
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    def _edge_count(seg_from, seg_to):
        # first matching row of counts_edges, or 0 if the edge is not listed
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        hits = sp.where(counts_edges[:, 0] == edge_id)[0]
        if hits.shape[0] > 0:
            return counts_edges[hits[0], 1]
        return 0

    # exon1 cov
    cov[0] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(seg_lens[seg_exon1])
    # exon2 cov
    cov[1] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(seg_lens[seg_exon2])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon1_conf
    cov[0] += _edge_count(seg_exon_pre[-1], seg_exon1[0])
    # exon_pre_exon2_conf
    cov[1] += _edge_count(seg_exon_pre[-1], seg_exon2[0])
    # exon1_exon_aft_conf
    cov[0] += _edge_count(seg_exon1[-1], seg_exon_aft[0])
    # exon2_exon_aft_conf
    cov[1] += _edge_count(seg_exon2[-1], seg_exon_aft[0])

    return cov
def replace_sub_matrix(mat_in, idx, mat_put):
    """Overwrite the sub-matrix of ``mat_in`` at rows/columns ``idx`` in place.

    For a 1-D index array ``idx`` of length ``n`` the cell
    ``(idx[a], idx[b])`` of ``mat_in`` receives element ``a * n + b`` of the
    flattened ``mat_put``.

    Parameters
    ----------
    mat_in : 2-D array that is modified in place.
    idx : 1-D integer array of row/column indices.
    mat_put : array with ``len(idx) ** 2`` elements.

    Returns
    -------
    ``mat_in`` itself (the same object), after the update.

    Raises
    ------
    AssertionError
        If ``mat_put`` does not hold exactly ``len(idx) ** 2`` elements.
    """
    n = idx.shape[0]
    assert((n * n) == mat_put.ravel().shape[0])
    if n == 0:
        # nothing to write; avoids building empty index lists that
        # ravel_multi_index would reject
        return mat_in

    # row index changes slowest, column index fastest (row-major order of mat_put)
    rows = sp.repeat(idx, n)
    cols = sp.tile(idx, n)
    flat = sp.ravel_multi_index([rows, cols], (mat_in.shape[0], mat_in.shape[1]))
    sp.put(mat_in, flat, mat_put.ravel())
    return mat_in
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    """Verify an alternative 5'/3' splice-site event against the counts.

    Parameters
    ----------
    event : object with ``exons1`` and ``exons2`` (two rows each: the exon
        pair of isoform 1 and of isoform 2).
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    CFG : dict; reads ``CFG['alt_prime']['min_diff_rel_cov']`` and
        ``CFG['alt_prime']['min_intron_count']``.

    Returns
    -------
    (verified, info)
        ``info`` = [valid, exon_diff_cov, exon_const_cov, intron1_conf,
        intron2_conf]; ``verified`` = [coverage criterion met, both introns
        reach the minimum count]. When the coordinates are invalid,
        ``info[0]`` is 0 and all other entries stay 0.

    Raises
    ------
    AssertionError
        If an exon maps to no segment or an intron has no entry in
        ``counts_edges``.
    SystemExit
        If neither exon of the pair is shared by both isoforms.
    """
    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)

    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] != event.exons2[1, 0]):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    def _exon_segments(start, stop):
        # segments of the matching splice-graph vertex; if the exon is not a
        # vertex, fall back to all segments contained in [start, stop]
        idx = sp.where((sg.vertices[0, :] == start) & (sg.vertices[1, :] == stop))[0]
        if idx.shape[0] == 0:
            return sp.where((segs.segments[0, :] >= start) & (segs.segments[1, :] <= stop))[0]
        return sp.where(segs.seg_match[idx, :])[1]

    ### find exons corresponding to event
    segs_exon11 = _exon_segments(event.exons1[0, 0], event.exons1[0, 1])
    segs_exon12 = _exon_segments(event.exons1[1, 0], event.exons1[1, 1])
    segs_exon21 = _exon_segments(event.exons2[0, 0], event.exons2[0, 1])
    segs_exon22 = _exon_segments(event.exons2[1, 0], event.exons2[1, 1])

    assert (segs_exon11.shape[0] > 0)
    assert (segs_exon12.shape[0] > 0)
    assert (segs_exon21.shape[0] > 0)
    assert (segs_exon22.shape[0] > 0)

    # array_equal instead of `==`: the alternative exons usually span a
    # different number of segments, and elementwise comparison of
    # unequal-length arrays is not a reliable equality test.
    if sp.array_equal(segs_exon11, segs_exon21):
        seg_exon_const, alt_a, alt_b = segs_exon11, segs_exon12, segs_exon22
    elif sp.array_equal(segs_exon12, segs_exon22):
        seg_exon_const, alt_a, alt_b = segs_exon12, segs_exon11, segs_exon21
    else:
        sys.stderr.write("ERROR: both exons differ in alt prime event in verify_alt_prime\n")
        sys.exit(1)

    # segments present in only one of the two alternative exons
    seg_diff = sp.setdiff1d(alt_a, alt_b)
    if seg_diff.shape[0] == 0:
        seg_diff = sp.setdiff1d(alt_b, alt_a)
    # constant part: the shared exon plus the shared part of the alternative exons
    seg_const = sp.r_[seg_exon_const, sp.intersect1d(alt_a, alt_b)]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] * seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    # take the scalar count (first hit) rather than a one-element array
    info[3] = counts_edges[idx[0], 1]
    # intron2_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    info[4] = counts_edges[idx[0], 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
def verify_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Verify an exon-skip event against segment and junction counts.

    NOTE(review): this file contains a second ``verify_exon_skip`` further
    down; being defined later, that one is the definition bound at import.

    Parameters
    ----------
    event : object with ``exons1`` and ``exons2``; ``exons2`` rows are the
        exon before, the skipped exon and the exon after.
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    CFG : dict; reads ``CFG['exon_skip']`` keys ``min_skip_rel_cov``,
        ``min_non_skip_count`` and ``min_skip_count``.

    Returns
    -------
    (verified, info)
        ``info`` = [valid, exon_cov, exon_pre_cov, exon_aft_cov,
        exon_pre_exon_conf, exon_exon_aft_conf, exon_pre_exon_aft_conf];
        ``verified`` = [relative coverage ok, pre->exon junction ok,
        exon->aft junction ok, skip junction ok]. A junction without an
        entry in ``counts_edges`` keeps a count of 0.
    """
    # [verified, info] = verify_exon_skip(event, fn_bam, CFG)
    verified = [0, 0, 0, 0]
    # (0) valid, (1) exon_cov, (2) exon_pre_cov, (3) exon_aft_cov,
    # (4) exon_pre_exon_conf, (5) exon_exon_aft_conf, (6) exon_pre_exon_aft_conf
    info = [1, 0, 0, 0, 0, 0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = False
        return (verified, info)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    elif sp.any(event.exons1[:, 1] - event.exons1[:, 0] < 1) or sp.any(
            event.exons2[:, 1] - event.exons2[:, 0] < 1):
        info[0] = False
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0]) & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon pre cov
    info[2] = sp.sum(counts_segments[seg_exon_pre] * seg_lens[seg_exon_pre]) / sp.sum(seg_lens[seg_exon_pre])
    # exon aft cov
    info[3] = sp.sum(counts_segments[seg_exon_aft] * seg_lens[seg_exon_aft]) / sp.sum(seg_lens[seg_exon_aft])
    # exon cov
    info[1] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(seg_lens[seg_exon])

    ### check if coverage of skipped exon is >= than FACTOR times average of pre and after
    if info[1] >= CFG['exon_skip']['min_skip_rel_cov'] * (info[2] + info[3]) / 2:
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # Each count is stored as a scalar (first hit); an absent junction leaves
    # the default 0 in place instead of producing an empty array.
    # exon_pre_exon_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[4] = counts_edges[idx[0], 1]
    if info[4] >= CFG['exon_skip']['min_non_skip_count']:
        verified[1] = 1
    # exon_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon[-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[5] = counts_edges[idx[0], 1]
    if info[5] >= CFG['exon_skip']['min_non_skip_count']:
        verified[2] = 1
    # exon_pre_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[6] = counts_edges[idx[0], 1]
    if info[6] >= CFG['exon_skip']['min_skip_count']:
        verified[3] = 1

    return (verified, info)
def generate_node_connectivity_array(index_map, data_array):
    r"""
    Builds the list of unique node-to-node connections for the voxels listed
    in ``index_map``, linking each one to every neighbor that shares a face,
    an edge or a corner with it.

    ``index_map`` holds one (x, y, z) row per node and ``data_array`` is the
    3-D volume that is indexed with the neighbor coordinates to decide which
    neighbors are kept. The result is an ``(n, 2)`` uint32 array of flattened
    voxel indices with the smaller index in the first column and no
    duplicate rows.
    """
    #
    logger.info('generating network connections...')
    #
    # neighbor offsets: all combinations of 0/-1/+1 minus the leading (0, 0, 0)
    offsets = sp.array(list(product([0, -1, 1], [0, -1, 1], [0, -1, 1])), dtype=int)[1:]
    num_offsets = offsets.shape[0]
    vol_shape = data_array.shape
    x_dim, y_dim, z_dim = vol_shape
    #
    # the nodes are processed in chunks; at least one (possibly empty) chunk
    chunk = 10000
    num_nodes = index_map.shape[0]
    slice_list = [slice(start, min(start + chunk, num_nodes))
                  for start in range(0, max(num_nodes, 1), chunk)]
    #
    conns = sp.ones((0, 2), dtype=sp.uint32)
    logger.debug(' number of slices to process: {}'.format(len(slice_list)))
    for sect in slice_list:
        # coordinates of the central nodes, repeated once per offset
        centers = index_map[sect]
        inds = sp.repeat(centers, num_offsets, axis=0)
        inds += sp.tile(offsets, (centers.shape[0], 1))
        #
        # flattened index of each central node goes into a 4th column
        center_ids = sp.ravel_multi_index(sp.hsplit(centers, 3), vol_shape)
        inds = sp.hstack([inds, sp.repeat(center_ids, num_offsets, axis=0)])
        #
        # drop neighbors with a negative coordinate (~x < 0 holds for x >= 0)
        non_negative = (~inds[:, 0:3]) < 0
        inds = inds[sp.sum(non_negative, axis=1) == 3]
        # drop neighbors beyond the upper bounds of the volume
        in_bounds = sp.stack((inds[:, 0] < x_dim,
                              inds[:, 1] < y_dim,
                              inds[:, 2] < z_dim), axis=1)
        inds = inds[sp.sum(in_bounds, axis=1) == 3]
        # keep only neighbors selected by the volume itself
        inds = inds[data_array[inds[:, 0], inds[:, 1], inds[:, 2]]]
        if not inds.size:
            continue
        # flattened index of the surviving neighbors, paired with their center
        neighbor_ids = sp.ravel_multi_index(sp.hsplit(inds[:, 0:3], 3), vol_shape)
        pairs = sp.hstack([sp.reshape(inds[:, -1], (-1, 1)), neighbor_ids])
        # order every pair as (smaller, larger) so duplicates become identical
        swap = pairs[:, 0] > pairs[:, 1]
        pairs[swap] = pairs[swap][:, ::-1]
        conns = sp.append(conns, pairs.astype(sp.uint32), axis=0)
    #
    # each row is viewed as one opaque item so unique() compares whole rows
    logger.info('removing duplicate connections...')
    num_before = conns.shape[0]
    num_cols = conns.shape[1]
    conns = sp.ascontiguousarray(conns)
    row_type = sp.dtype((sp.void, conns.dtype.itemsize * num_cols))
    conns = sp.unique(conns.view(row_type)).view(conns.dtype).reshape(-1, num_cols)
    logger.debug(' removed {} duplicates'.format(num_before - conns.shape[0]))
    #
    return conns
def verify_mutex_exons(event, gene, counts_segments, counts_edges, CFG):
    """Verify a mutually-exclusive-exons event against the counts.

    ``event.exons1`` / ``event.exons2`` share their first and last rows (the
    flanking exons); row 1 holds the middle exon of the respective isoform.

    Returns ``(verified, info)`` where

    * ``info`` = [valid, exon_pre_cov, exon1_cov, exon2_cov, exon_aft_cov,
      exon_pre_exon1_conf, exon_pre_exon2_conf, exon1_exon_aft_conf,
      exon2_exon_aft_conf];
    * ``verified`` = [exon1 coverage ok, exon2 coverage ok, both upstream
      junctions reach the minimum count, both downstream junctions reach
      the minimum count].

    Invalid coordinates set ``info[0]`` to 0 and return immediately. The
    thresholds come from ``CFG['mutex_exons']`` (``min_skip_rel_cov`` and
    ``min_conf_count``).
    """
    verified = [0, 0, 0, 0]
    info = [1] + [0] * 8

    ex1 = event.exons1
    ex2 = event.exons2

    ### coordinates must not be negative
    if sp.any(ex1 < 0) or sp.any(ex2 < 0):
        info[0] = 0
        return (verified, info)
    ### every exon needs start < stop and the two middle exons must not overlap
    if (sp.any(ex1[:, 1] - ex1[:, 0] < 1) or sp.any(ex2[:, 1] - ex2[:, 0] < 1)
            or (ex1[1, 1] > ex2[1, 0] and ex1[1, 0] < ex2[1, 0])
            or (ex2[1, 1] > ex1[1, 0] and ex2[1, 0] < ex1[1, 0])):
        info[0] = 0
        return (verified, info)

    vertices = gene.splicegraph.vertices
    segs = gene.segmentgraph

    def segments_of(start, stop):
        # sorted segment ids of the splice-graph vertex with these coordinates
        vertex = sp.where((vertices[0, :] == start) & (vertices[1, :] == stop))[0]
        return sp.sort(sp.where(segs.seg_match[vertex, :])[1])

    seg_pre = segments_of(ex1[0, 0], ex1[0, 1])
    seg_aft = segments_of(ex1[-1, 0], ex1[-1, 1])
    seg_mid1 = segments_of(ex1[1, 0], ex1[1, 1])
    seg_mid2 = segments_of(ex2[1, 0], ex2[1, 1])

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    def mean_cov(seg_ids):
        # length-weighted mean of the per-segment coverage
        return sp.sum(counts_segments[seg_ids] * seg_lens[seg_ids]) / sp.sum(seg_lens[seg_ids])

    info[1] = mean_cov(seg_pre)
    info[2] = mean_cov(seg_mid1)
    info[3] = mean_cov(seg_mid2)
    info[4] = mean_cov(seg_aft)

    ### each middle exon is compared with FACTOR times the mean of the flanking exons
    flank_threshold = CFG['mutex_exons']['min_skip_rel_cov'] * (info[1] + info[4])/2
    if info[2] >= flank_threshold:
        verified[0] = 1
    if info[3] >= flank_threshold:
        verified[1] = 1

    ### junction confirmation: number of reads stored for the edge, 0 if absent
    junctions = (
        (5, seg_pre[-1], seg_mid1[0]),   # exon_pre_exon1_conf
        (6, seg_pre[-1], seg_mid2[0]),   # exon_pre_exon2_conf
        (7, seg_mid1[-1], seg_aft[0]),   # exon1_exon_aft_conf
        (8, seg_mid2[-1], seg_aft[0]),   # exon2_exon_aft_conf
    )
    for slot, seg_from, seg_to in junctions:
        hits = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_from, seg_to], segs.seg_edges.shape))[0]
        if len(hits.shape) > 0 and hits.shape[0] > 0:
            info[slot] = counts_edges[hits[0], 1]

    # set verification flags for intron confirmation
    min_conf = CFG['mutex_exons']['min_conf_count']
    if min(info[5], info[6]) >= min_conf:
        verified[2] = 1
    if min(info[7], info[8]) >= min_conf:
        verified[3] = 1

    return (verified, info)
def verify_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Verify an exon-skip event against segment and junction counts.

    NOTE(review): ``verify_exon_skip`` is defined twice in this file; this
    later definition is the one bound at import time.

    Parameters
    ----------
    event : object with ``exons1`` and ``exons2``; ``exons2`` rows are the
        exon before, the skipped exon and the exon after.
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    CFG : dict; reads ``CFG['exon_skip']`` keys ``min_skip_rel_cov``,
        ``min_non_skip_count`` (inclusion junctions) and ``min_skip_count``
        (skip junction).

    Returns
    -------
    (verified, info)
        ``info`` = [valid, exon_cov, exon_pre_cov, exon_aft_cov,
        exon_pre_exon_conf, exon_exon_aft_conf, exon_pre_exon_aft_conf];
        ``verified`` = [relative coverage ok, pre->exon junction ok,
        exon->aft junction ok, skip junction ok]. A junction without an
        entry in ``counts_edges`` keeps a count of 0.
    """
    # [verified, info] = verify_exon_skip(event, fn_bam, CFG)
    verified = [0, 0, 0, 0]
    # (0) valid, (1) exon_cov, (2) exon_pre_cov, (3) exon_aft_cov,
    # (4) exon_pre_exon_conf, (5) exon_exon_aft_conf, (6) exon_pre_exon_aft_conf
    info = [1, 0, 0, 0, 0, 0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = False
        return (verified, info)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    elif sp.any(event.exons1[:, 1] - event.exons1[:, 0] < 1) or sp.any(event.exons2[:, 1] - event.exons2[:, 0] < 1):
        info[0] = False
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0]) & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

    ### find segments corresponding to exons
    seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
    seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
    seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon pre cov
    info[2] = sp.sum(counts_segments[seg_exon_pre] * seg_lens[seg_exon_pre]) / sp.sum(seg_lens[seg_exon_pre])
    # exon aft cov
    info[3] = sp.sum(counts_segments[seg_exon_aft] * seg_lens[seg_exon_aft]) / sp.sum(seg_lens[seg_exon_aft])
    # exon cov
    info[1] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(seg_lens[seg_exon])

    ### check if coverage of skipped exon is >= than FACTOR times average of pre and after
    if info[1] >= CFG['exon_skip']['min_skip_rel_cov'] * (info[2] + info[3]) / 2:
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # Counts are stored as scalars (first hit); an absent junction keeps 0.
    # exon_pre_exon_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[4] = counts_edges[idx[0], 1]
    if info[4] >= CFG['exon_skip']['min_non_skip_count']:
        verified[1] = 1
    # exon_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon[-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[5] = counts_edges[idx[0], 1]
    if info[5] >= CFG['exon_skip']['min_non_skip_count']:
        verified[2] = 1
    # exon_pre_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_exon_pre[-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[6] = counts_edges[idx[0], 1]
    # the skip junction has its own threshold; it was previously compared
    # against the inclusion-junction threshold (min_non_skip_count)
    if info[6] >= CFG['exon_skip']['min_skip_count']:
        verified[3] = 1

    return (verified, info)
def quantify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    """Quantify the two isoforms of an alternative 5'/3' splice-site event.

    Parameters
    ----------
    event : object with ``exons1`` and ``exons2`` (two rows each: the exon
        pair of isoform 1 and of isoform 2) in the native layout.
    gene : object exposing ``splicegraph`` and ``segmentgraph``.
    counts_segments : array with one coverage value per segment.
    counts_edges : 2-D array; column 0 is the flattened segment-edge id,
        column 1 the count stored for that edge.
    CFG : dict; only ``CFG["is_matlab"]`` is read.

    Returns
    -------
    cov : float array of length 2. In the native layout, ``cov[k]`` is the
        intron count of isoform ``k``; the isoform that owns the segments
        found in only one alternative exon additionally receives their
        length-weighted mean coverage.

    Raises
    ------
    AssertionError
        If an exon maps to no segment or an intron has no entry in
        ``counts_edges``.
    SystemExit
        If neither exon of the pair is shared by both isoforms.
    Exception
        If the two alternative exons consist of identical segments.
    """
    cov = sp.zeros((2,), dtype="float")
    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG["is_matlab"]:
        # NOTE(review): this branch is kept as found and cannot run as written:
        #   * sp.where(...) is used without [0], so idx_exon_* are tuples and
        #     the `.shape` checks below raise AttributeError;
        #   * seg_diff is never assigned in this branch;
        #   * `+ 1` is applied to the row indices into counts_edges, whereas
        #     quantify_exon_skip adds the offset to the flattened edge id.
        # The intended semantics need to be confirmed before it is repaired.
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        seg_shape = segs[0, 2].shape[0]

        idx_exon_alt1 = sp.where((sg[0, 0][0, :] == event.exon_alt1[0]) & (sg[0, 0][1, :] == event.exon_alt1[1]))
        idx_exon_alt2 = sp.where((sg[0, 0][0, :] == event.exon_alt2[0]) & (sg[0, 0][1, :] == event.exon_alt2[1]))
        idx_exon_const = sp.where((sg[0, 0][0, :] == event.exon_const[0]) & (sg[0, 0][1, :] == event.exon_const[1]))
        if idx_exon_alt1.shape[0] == 0:
            segs_exon_alt1 = sp.where(
                (segs[0, 0][0, :] >= event.exon_alt1[0]) & (segs[0, 0][1, :] >= event.exon_alt1[1])
            )
        else:
            segs_exon_alt1 = sp.where(segs[0, 1][idx_exon_alt1, :])[1]
        if idx_exon_alt2.shape[0] == 0:
            segs_exon_alt2 = sp.where(
                (segs[0, 0][0, :] >= event.exon_alt2[0]) & (segs[0, 0][1, :] >= event.exon_alt2[1])
            )
        else:
            segs_exon_alt2 = sp.where(segs[0, 1][idx_exon_alt2, :])[1]
        if idx_exon_const.shape[0] == 0:
            segs_exon_const = sp.where(
                (segs[0, 0][0, :] >= event.exon_const[0]) & (segs[0, 0][1, :] >= event.exon_const[1])
            )
        else:
            segs_exon_const = sp.where(segs[0, 1][idx_exon_const, :])[1]

        assert segs_exon_alt1.shape[0] > 0
        assert segs_exon_alt2.shape[0] > 0
        assert segs_exon_const.shape[0] > 0

        cov[1] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])

        ### check intron confirmations as sum of valid intron scores
        ### intron score is the number of reads confirming this intron
        if max(segs_exon_alt1[-1], segs_exon_alt2[-1]) < segs_exon_const[0]:
            # intron1_conf
            idx = (
                sp.where(
                    counts_edges[:, 0] == sp.ravel_multi_index([segs_exon_alt1[0], segs_exon_const[-1]], seg_shape)
                )[0]
                + 1
            )
            assert idx.shape[0] > 0
            cov[0] += counts_edges[idx, 1]
            # intron2_conf
            idx = (
                sp.where(
                    counts_edges[:, 0] == sp.ravel_multi_index([segs_exon_alt2[0], segs_exon_const[-1]], seg_shape)
                )[0]
                + 1
            )
            assert idx.shape[0] > 0
            cov[1] += counts_edges[idx, 1]
        elif min(segs_exon_alt1[0], segs_exon_alt2[0]) > segs_exon_const[-1]:
            # intron1_conf
            idx = (
                sp.where(
                    counts_edges[:, 0] == sp.ravel_multi_index([segs_exon_const[0], segs_exon_alt1[-1]], seg_shape)
                )[0]
                + 1
            )
            assert idx.shape[0] > 0
            cov[0] += counts_edges[idx, 1]
            # intron2_conf
            idx = (
                sp.where(
                    counts_edges[:, 0] == sp.ravel_multi_index([segs_exon_const[0], segs_exon_alt2[-1]], seg_shape)
                )[0]
                + 1
            )
            assert idx.shape[0] > 0
            cov[1] += counts_edges[idx, 1]
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        # full (rows, cols) shape is required to flatten a (from, to) edge index
        seg_shape = segs.seg_edges.shape

        def _exon_segments(start, stop):
            # segments of the matching splice-graph vertex; if the exon is not
            # a vertex, fall back to all segments contained in [start, stop]
            idx = sp.where((sg.vertices[0, :] == start) & (sg.vertices[1, :] == stop))[0]
            if idx.shape[0] == 0:
                return sp.where((segs.segments[0, :] >= start) & (segs.segments[1, :] <= stop))[0]
            return sp.where(segs.seg_match[idx, :])[1]

        ### find exons corresponding to event
        segs_exon11 = _exon_segments(event.exons1[0, 0], event.exons1[0, 1])
        segs_exon12 = _exon_segments(event.exons1[1, 0], event.exons1[1, 1])
        segs_exon21 = _exon_segments(event.exons2[0, 0], event.exons2[0, 1])
        segs_exon22 = _exon_segments(event.exons2[1, 0], event.exons2[1, 1])

        assert segs_exon11.shape[0] > 0
        assert segs_exon12.shape[0] > 0
        assert segs_exon21.shape[0] > 0
        assert segs_exon22.shape[0] > 0

        # array_equal instead of `==`: the alternative exons usually span a
        # different number of segments
        if sp.array_equal(segs_exon11, segs_exon21):
            alt_iso1, alt_iso2 = segs_exon12, segs_exon22
        elif sp.array_equal(segs_exon12, segs_exon22):
            alt_iso1, alt_iso2 = segs_exon11, segs_exon21
        else:
            sys.stderr.write("ERROR: both exons differ in alt prime event in quantify_alt_prime\n")
            sys.exit(1)

        # exon_diff_cov: segments found in only one alternative exon are
        # credited to the isoform that contains them
        seg_diff = sp.setdiff1d(alt_iso1, alt_iso2)
        diff_isoform = 0
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(alt_iso2, alt_iso1)
            diff_isoform = 1
        if seg_diff.shape[0] == 0:
            raise Exception("differential segment not part of any other segment")
        cov[diff_isoform] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])

        ### check intron confirmations as sum of valid intron scores
        ### intron score is the number of reads confirming this intron
        # intron1_conf
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon11[-1], segs_exon12[0]], seg_shape))[0]
        assert idx.shape[0] > 0
        cov[0] += counts_edges[idx[0], 1]
        # intron2_conf
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon21[-1], segs_exon22[0]], seg_shape))[0]
        assert idx.shape[0] > 0
        cov[1] += counts_edges[idx[0], 1]

    return cov
def count_graph_coverage(genes, fn_bam=None, CFG=None, fn_out=None):
    """Count segment and splice-junction coverage for all genes and samples.

    Usage: counts = count_graph_coverage(genes, fn_bam, CFG, fn_out)

    Args:
        genes: array of gene objects (sized via ``genes.shape[0]``), or a
            dict bundling the arguments under the keys 'genes', 'fn_bam',
            'CFG' and, optionally, 'fn_out'.
        fn_bam: one alignment file name or a list of them (one per sample).
        CFG: settings dict; 'bam_to_sparse', 'var_aware' and 'primary_only'
            are read here.
        fn_out: when given, the result is pickled to this path and nothing
            is returned.

    Returns:
        Object array of shape (samples, genes) with one ``Counts`` entry per
        cell, or None when ``fn_out`` is given.
    """
    if fn_bam is None and isinstance(genes, dict):
        PAR = genes
        genes = PAR['genes']
        fn_bam = PAR['fn_bam']
        if 'fn_out' in PAR:
            fn_out = PAR['fn_out']
        CFG = PAR['CFG']

    if not isinstance(fn_bam, list):
        fn_bam = [fn_bam]
    counts = sp.zeros((len(fn_bam), genes.shape[0]), dtype='object')

    intron_tol = 0

    ### the contig of each gene does not change between samples
    contigs = sp.array([x.chr for x in genes])

    sys.stdout.write('genes: %i\n' % genes.shape[0])
    for f in range(counts.shape[0]):
        sys.stdout.write('\nsample %i/%i\n' % (f + 1, counts.shape[0]))

        ### iterate over all genes and generate counts for
        ### the segments in the segment graph
        ### and the splice junctions in the splice graph
        ### iterate per contig, so the bam caching works better
        for contig in sp.unique(contigs):
            contig_idx = sp.where(contigs == contig)[0]
            bam_cache = dict()
            ### single-argument call form works as statement (py2) and function (py3)
            print('\ncounting %i genes on contig %s' % (contig_idx.shape[0], contig))
            for ii, i in enumerate(contig_idx):
                sys.stdout.write('.')
                if ii > 0 and ii % 50 == 0:
                    sys.stdout.write('%i/%i\n' % (ii, contig_idx.shape[0]))
                sys.stdout.flush()
                gg = genes[i]
                if gg.segmentgraph.is_empty():
                    gg.segmentgraph = Segmentgraph(gg)
                gg.start = gg.segmentgraph.segments.ravel().min()
                gg.stop = gg.segmentgraph.segments.ravel().max()

                counts[f, i] = Counts(gg.segmentgraph.segments.shape[1])

                if CFG['bam_to_sparse'] and (fn_bam[f].endswith('npz') or os.path.exists(re.sub(r'bam$', '', fn_bam[f]) + 'npz')):
                    ### make sure that we query the right contig from cache
                    assert(gg.chr == contig)
                    (tracks, intron_list) = add_reads_from_sparse_bam(gg, fn_bam[f], contig, types=['exon_track', 'intron_list'], filter=None, cache=bam_cache)
                else:
                    ### add RNA-seq evidence to the gene structure
                    (tracks, intron_list) = add_reads_from_bam(gg, fn_bam[f], ['exon_track', 'intron_list'], None, CFG['var_aware'], CFG['primary_only'])
                    intron_list = intron_list[0]  ### TODO

                ### extract mean exon coverage for all segments
                segments = gg.segmentgraph.segments
                for j in range(segments.shape[1]):
                    idx = sp.arange(segments[0, j], segments[1, j]) - gg.start
                    profile = sp.sum(tracks[:, idx], axis=0)
                    counts[f, i].segments[j] = sp.mean(profile)
                    counts[f, i].seg_pos[j] = sp.sum(profile > 0)

                ### extract intron counts: one row (flat edge index, read count)
                ### per segment edge; the count is 0 if no intron supports it
                k, l = sp.where(gg.segmentgraph.seg_edges == 1)
                edge_rows = []
                for m in range(k.shape[0]):
                    edge_id = sp.ravel_multi_index([k[m], l[m]], gg.segmentgraph.seg_edges.shape)
                    conf = 0
                    if intron_list.shape[0] > 0:
                        idx = sp.where((sp.absolute(intron_list[:, 0] - segments[1, k[m]]) <= intron_tol) &
                                       (sp.absolute(intron_list[:, 1] - segments[0, l[m]]) <= intron_tol))[0]
                        if idx.shape[0] > 0:
                            conf = sp.sum(intron_list[idx, 2])
                    edge_rows.append(sp.array([edge_id, conf]))

                ### stack once instead of re-allocating the table for every edge
                if edge_rows:
                    if counts[f, i].edges.shape[0] > 0:
                        edge_rows.insert(0, counts[f, i].edges)
                    counts[f, i].edges = sp.vstack(edge_rows)

    if fn_out is not None:
        ### protocol -1 is a binary format: needs a binary handle that gets closed
        with open(fn_out, 'wb') as fh:
            cPickle.dump(counts, fh, -1)
    else:
        return counts
def verify_mutex_exons(event, gene, counts_segments, counts_edges, CFG):
    """Check a mutually-exclusive-exons event against coverage evidence.

    Usage: [verified, info] = verify_mutex_exons(event, gene, counts_segments, counts_edges, CFG)

    Args:
        event: object with ``exons1`` and ``exons2`` coordinate arrays; row 0
            is read as the upstream exon, row 1 as the alternative exon and
            the last row as the downstream exon.
        gene: object providing ``splicegraph.vertices`` and
            ``segmentgraph.segments`` / ``seg_match`` / ``seg_edges``.
        counts_segments: per-segment coverage, indexed by segment number.
        counts_edges: 2-column array; column 0 is the flat segment-edge index
            (``ravel_multi_index`` over ``seg_edges.shape``), column 1 the
            read count.
        CFG: settings dict; ``CFG['mutex_exons']['min_skip_rel_cov']`` and
            ``CFG['mutex_exons']['min_conf_count']`` are read.

    Returns:
        Tuple ``(verified, info)``. ``verified`` holds four 0/1 flags:
        exon1 coverage ok, exon2 coverage ok, both upstream junctions
        confirmed, both downstream junctions confirmed. ``info`` holds
        (0) valid, (1) exon_pre_cov, (2) exon1_cov, (3) exon2_cov,
        (4) exon_aft_cov, (5) exon_pre_exon1_conf, (6) exon_pre_exon2_conf,
        (7) exon1_exon_aft_conf, (8) exon2_exon_aft_conf.

    Raises:
        IndexError: if one of the exons has no matching segments.
    """
    verified = [0, 0, 0, 0]
    info = [1, 0, 0, 0, 0, 0, 0, 0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    elif (sp.any(event.exons1[:, 1] - event.exons1[:, 0] < 1) or
          sp.any(event.exons2[:, 1] - event.exons2[:, 0] < 1) or
          (event.exons1[1, 1] > event.exons2[1, 0] and event.exons1[1, 0] < event.exons2[1, 0]) or
          (event.exons2[1, 1] > event.exons1[1, 0] and event.exons2[1, 0] < event.exons1[1, 0])):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph
    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    def _exon_segments(exon):
        # sorted indices of the segments matched to the vertex with these coordinates
        idx = sp.where((sg.vertices[0, :] == exon[0]) & (sg.vertices[1, :] == exon[1]))[0]
        return sp.sort(sp.where(segs.seg_match[idx, :])[1])

    def _mean_cov(seg_idx):
        # length-weighted mean coverage over the given segments
        return sp.sum(counts_segments[seg_idx] * seg_lens[seg_idx]) / sp.sum(seg_lens[seg_idx])

    def _edge_count(seg_from, seg_to):
        # read count of the junction seg_from -> seg_to, 0 if it was not counted
        edge_id = sp.ravel_multi_index([seg_from, seg_to], segs.seg_edges.shape)
        idx = sp.where(counts_edges[:, 0] == edge_id)[0]
        if idx.shape[0] > 0:
            return counts_edges[idx[0], 1]
        return 0

    ### find segments corresponding to exons
    seg_exon_pre = _exon_segments(event.exons1[0])
    seg_exon_aft = _exon_segments(event.exons1[-1])
    seg_exon1 = _exon_segments(event.exons1[1])
    seg_exon2 = _exon_segments(event.exons2[1])

    info[1] = _mean_cov(seg_exon_pre)
    info[2] = _mean_cov(seg_exon1)
    info[3] = _mean_cov(seg_exon2)
    info[4] = _mean_cov(seg_exon_aft)

    ### check if coverage of each alternative exon is >= FACTOR times average of pre and after
    min_cov = CFG['mutex_exons']['min_skip_rel_cov'] * (info[1] + info[4]) / 2
    if info[2] >= min_cov:
        verified[0] = 1
    if info[3] >= min_cov:
        verified[1] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    info[5] = _edge_count(seg_exon_pre[-1], seg_exon1[0])
    info[6] = _edge_count(seg_exon_pre[-1], seg_exon2[0])
    info[7] = _edge_count(seg_exon1[-1], seg_exon_aft[0])
    info[8] = _edge_count(seg_exon2[-1], seg_exon_aft[0])

    # set verification flags for intron confirmation
    if min(info[5], info[6]) >= CFG['mutex_exons']['min_conf_count']:
        verified[2] = 1
    if min(info[7], info[8]) >= CFG['mutex_exons']['min_conf_count']:
        verified[3] = 1

    return (verified, info)
def make_amb(Fsorg,m_up,plen,nlags,nspec=128,winname = 'boxcar'):
    """ Make the ambiguity function dictionary that holds the lag ambiguity and
    range ambiguity. Uses a sinc function weighted by the window named in
    winname. Currently only set up for an uncoded pulse.
    Inputs:
        Fsorg: A scalar, the original sampling frequency in Hertz.
        m_up: The upsampled ratio between the original sampling rate and the rate of
        the ambiguity function up sampling.
        plen: The length of the pulse. It is compared directly against time
        axes built from 1/Fsorg, so it is presumably in seconds (TODO confirm).
        nlags: The number of lags used.
        nspec: The number of columns of the 'WttMatrix' output and the number
        of points of the lag axis it is built on.
        winname: Name of the window handed to scisig.get_window to taper the sinc.
    Outputs:
        Wttdict: A dictionary with the keys 'WttAll' which is the full ambiguity function
        for each lag (lag x delay x range), 'Wtt' is the max of 'WttAll' over the lag axis,
        'Wrange' is 'WttAll' summed over the delay axis, 'Wlag' is 'WttAll' summed over the
        range axis, 'Delay' the numpy array for the lag sampling, 'Range' the array for the
        range sampling and 'WttMatrix' for a matrix that will impart the ambiguity
        function on a pulses.
    """
    # make the sinc; force an odd, integer number of taps because get_window
    # needs an integer length
    nsamps = int(sp.floor(8.5*m_up))
    nsamps = nsamps-(1-nsamps % 2)

    half = sp.floor(nsamps/2.0)
    nvec = sp.arange(-half, half+1)
    curwin = scisig.get_window(winname, nsamps)
    outsinc = curwin*sp.sinc(nvec/m_up)
    outsinc = outsinc/sp.sum(outsinc)

    dt = 1/(Fsorg*m_up)
    Delay = sp.arange(-(len(nvec)-1), m_up*(nlags+5))*dt
    t_rng = sp.arange(0, 1.5*plen, dt)
    numdiff = len(Delay)-len(outsinc)
    outsincpad = sp.pad(outsinc, (0, numdiff), mode='constant', constant_values=(0.0, 0.0))
    (srng, d2d) = sp.meshgrid(t_rng, Delay)

    # envelop function
    envfunc = sp.zeros(d2d.shape)
    shifted = d2d-srng+plen-Delay.min()
    envfunc[(shifted >= 0) & (shifted <= plen)] = 1
    envfunc = envfunc/sp.sqrt(envfunc.sum(axis=0).max())

    # create the ambiguity function for everything
    Wtt = sp.zeros((nlags, d2d.shape[0], d2d.shape[1]))
    cursincrep = sp.tile(outsincpad[:, sp.newaxis], (1, d2d.shape[1]))
    Wt0 = cursincrep*envfunc
    # same FFT routine as in the loop below
    Wt0fft = scfft.fft(Wt0, axis=0)
    for ilag in sp.arange(nlags):
        cursinc = sp.roll(outsincpad, ilag*m_up)
        cursincrep = sp.tile(cursinc[:, sp.newaxis], (1, d2d.shape[1]))
        Wta = cursincrep*envfunc
        # do fft based convolution, probably best method given sizes
        Wtafft = scfft.fft(Wta, axis=0)
        if ilag == 0:
            nmove = len(nvec)-1
        else:
            nmove = len(nvec)
        Wtt[ilag] = sp.roll(scfft.ifft(Wtafft*sp.conj(Wt0fft), axis=0).real, nmove, axis=0)

    # make matrix to take
    # triangle window
    tau = sp.arange(-sp.floor(nspec/2.), sp.ceil(nspec/2.))/Fsorg
    amb1d = plen-tau
    amb1d[amb1d < 0] = 0.
    amb1d[tau < 0] = 0.
    amb1d = amb1d/plen
    kp = sp.argwhere(amb1d > 0).flatten()
    lagmat = sp.zeros((Wtt.shape[0], nspec))
    # NOTE(review): the lag indices and kp are paired element-wise, so the
    # number of positive triangle samples has to broadcast against nlags.
    lagmat.flat[sp.ravel_multi_index((sp.arange(Wtt.shape[0]), kp), lagmat.shape)] = amb1d[kp]
    Wttdict = {'WttAll':Wtt, 'Wtt':Wtt.max(axis=0), 'Wrange':Wtt.sum(axis=1), 'Wlag':Wtt.sum(axis=2),
               'Delay':Delay, 'Range':v_C_0*t_rng/2.0, 'WttMatrix':lagmat}
    return Wttdict
def generate_node_connectivity_array(index_map, data_array):
    r"""
    Generates a node connectivity array based on faces, edges and corner
    adjacency.

    index_map is an (N, 3) integer array of node coordinates and data_array a
    3-D array that also exposes an ``index_int_type`` attribute. A neighbor
    is kept when it lies inside data_array and data_array is non-zero there.

    Returns an (M, 2) array of unique connections between flattened node
    indices, with column 0 <= column 1 in every row.
    """
    #
    logger.info('generating network connections...')
    #
    # setting up some constants
    x_dim, y_dim, z_dim = data_array.shape
    # all 26 non-zero offsets; product() yields (0, 0, 0) first, so drop it
    conn_map = sp.array(list(product([0, -1, 1], repeat=3)), dtype=int)[1:]
    num_neighbors = conn_map.shape[0]
    #
    # creating slice list to process data chunks (always at least one slice)
    chunk = 10000
    num_nodes = index_map.shape[0]
    slice_list = [slice(start, min(start + chunk, num_nodes))
                  for start in range(0, max(num_nodes, 1), chunk)]
    #
    # sections are collected and joined once after the loop
    sections = [sp.ones((0, 2), dtype=data_array.index_int_type)]
    logger.debug('\tnumber of slices to process: {}'.format(len(slice_list)))
    percent = 10
    for n, sect in enumerate(slice_list):
        # getting coordinates of nodes and their neighbors
        centers = index_map[sect]
        inds = sp.repeat(centers, num_neighbors, axis=0)
        inds += sp.tile(conn_map, (centers.shape[0], 1))
        #
        # calculating the flattened index of the central nodes and storing
        center_ids = sp.ravel_multi_index(sp.hsplit(centers, 3), data_array.shape)
        inds = sp.hstack([inds, sp.repeat(center_ids, num_neighbors, axis=0)])
        #
        # removing neighbors with negative indices
        inds = inds[sp.all(inds[:, 0:3] >= 0, axis=1)]
        # removing neighbors with indices outside of bounds
        in_bounds = ((inds[:, 0] < x_dim) &
                     (inds[:, 1] < y_dim) &
                     (inds[:, 2] < z_dim))
        inds = inds[in_bounds]
        # removing indices with zero-weight connection; forced to bool so a
        # 0/1 integer array is not mistaken for a list of row numbers
        keep = sp.asarray(data_array[inds[:, 0], inds[:, 1], inds[:, 2]], dtype=bool)
        inds = inds[keep]
        if inds.size:
            # calculating flattened index of remaining neighbor nodes
            neighbor_ids = sp.ravel_multi_index(sp.hsplit(inds[:, 0:3], 3),
                                                data_array.shape)
            inds = sp.hstack([sp.reshape(inds[:, -1], (-1, 1)), neighbor_ids])
            # ensuring conns[0] is always < conns[1] for duplicate removal
            swap = inds[:, 0] > inds[:, 1]
            inds[swap] = inds[swap][:, ::-1]
            # storing section connectivity data
            # NOTE(review): sections are cast to uint32 although the result is
            # created with index_int_type - confirm indices cannot exceed 2**32
            sections.append(inds.astype(sp.uint32))
        # integer arithmetic keeps the progress check exact
        if 100 * n // len(slice_list) == percent:
            logger.debug('\tprocessed slice {:5d}, {}% complete'.format(
                n, percent))
            percent += 10
    conns = sp.concatenate(sections, axis=0)
    #
    # using scipy magic from stackoverflow to remove duplicate connections:
    # each row is viewed as one opaque item so unique() compares whole rows
    logger.info('removing duplicate connections...')
    dim0 = conns.shape[0]
    conns = sp.ascontiguousarray(conns)
    dtype = sp.dtype((sp.void, conns.dtype.itemsize * conns.shape[1]))
    dim1 = conns.shape[1]
    conns = sp.unique(conns.view(dtype)).view(conns.dtype).reshape(-1, dim1)
    logger.debug('\tremoved {} duplicates'.format(dim0 - conns.shape[0]))
    #
    return conns
def quantify_alt_prime(event, gene, counts_segments, counts_edges):
    """Quantify both isoforms of an alternative 3'/5' splice site event.

    Args:
        event: object with ``exons1`` and ``exons2``, the (2, 2) exon
            coordinate arrays of the two isoforms.
        gene: object providing ``splicegraph.vertices`` and
            ``segmentgraph.segments`` / ``seg_match`` / ``seg_edges``.
        counts_segments: per-segment coverage, indexed by segment number.
        counts_edges: 2-column array; column 0 is the flat segment-edge index
            (``ravel_multi_index`` over ``seg_edges.shape``), column 1 the
            read count.

    Returns:
        Float array of length 2. Each entry is the intron read count of one
        isoform; the length-weighted mean coverage of the differential
        segments is added to the isoform that contains them.

    Raises:
        AssertionError: if an exon maps to no segment or an intron of the
            event has no entry in ``counts_edges``.
        Exception: if the differential segments belong to neither isoform.
        SystemExit: if both exons differ between the isoforms.
    """
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    seg_lens = segs.segments[1, :] - segs.segments[0, :]
    ### edge ids are flat indices into the full 2-D seg_edges matrix
    seg_shape = segs.seg_edges.shape

    def _exon_segments(exon):
        # segments matched to the exon's vertex; if the exon is not a vertex,
        # fall back to all segments contained in its coordinates
        idx = sp.where((sg.vertices[0, :] == exon[0]) & (sg.vertices[1, :] == exon[1]))[0]
        if idx.shape[0] == 0:
            return sp.where((segs.segments[0, :] >= exon[0]) & (segs.segments[1, :] <= exon[1]))[0]
        return sp.where(segs.seg_match[idx, :])[1]

    def _contains(pool, sub):
        # True if sub is non-empty and every element of sub occurs in pool
        return sub.shape[0] > 0 and sp.setdiff1d(sub, pool).shape[0] == 0

    def _edge_count(seg_from, seg_to):
        # read count of the junction seg_from -> seg_to (must have been counted)
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([seg_from, seg_to], seg_shape))[0]
        assert idx.shape[0] > 0
        return counts_edges[idx[0], 1]

    ### find exons corresponding to event
    segs_exon11 = _exon_segments(event.exons1[0])
    segs_exon12 = _exon_segments(event.exons1[1])
    segs_exon21 = _exon_segments(event.exons2[0])
    segs_exon22 = _exon_segments(event.exons2[1])

    assert segs_exon11.shape[0] > 0
    assert segs_exon12.shape[0] > 0
    assert segs_exon21.shape[0] > 0
    assert segs_exon22.shape[0] > 0

    if sp.all(segs_exon11 == segs_exon21):
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
    elif sp.all(segs_exon12 == segs_exon22):
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
    else:
        sys.stderr.write("ERROR: both exons differ in alt prime event in verify_alt_prime\n")
        sys.exit(1)

    # exon_diff_cov
    if _contains(segs_exon11, seg_diff) or _contains(segs_exon12, seg_diff):
        cov[0] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    elif _contains(segs_exon21, seg_diff) or _contains(segs_exon22, seg_diff):
        cov[1] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    else:
        raise Exception('differential segment not part of any other segment')

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    cov[0] += _edge_count(segs_exon11[-1], segs_exon12[0])
    # intron2_conf
    cov[1] += _edge_count(segs_exon21[-1], segs_exon22[0])

    return cov
def count_graph_coverage(genes, fn_bam=None, CFG=None, fn_out=None):
    """Count segment and splice-junction coverage for all genes and samples.

    Usage: counts = count_graph_coverage(genes, fn_bam, CFG, fn_out)

    Args:
        genes: array of gene objects (sized via ``genes.shape[0]``), or a
            dict bundling the arguments under the keys 'genes', 'fn_bam',
            'CFG' and, optionally, 'fn_out'.
        fn_bam: one alignment file name or a list of them (one per sample).
            With ``CFG['bam_to_sparse']`` set, an ``.npz`` summary (given
            directly or found next to the bam file) is read instead.
        CFG: settings dict; 'bam_to_sparse', 'var_aware' and 'primary_only'
            are read here.
        fn_out: when given, the result is pickled to this path and nothing
            is returned.

    Returns:
        Object array of shape (samples, genes) with one ``Counts`` entry per
        cell, or None when ``fn_out`` is given.
    """
    if fn_bam is None and isinstance(genes, dict):
        PAR = genes
        genes = PAR['genes']
        fn_bam = PAR['fn_bam']
        if 'fn_out' in PAR:
            fn_out = PAR['fn_out']
        CFG = PAR['CFG']

    if not isinstance(fn_bam, list):
        fn_bam = [fn_bam]
    counts = sp.zeros((len(fn_bam), genes.shape[0]), dtype='object')

    intron_tol = 0

    sys.stdout.write('genes: %i\n' % genes.shape[0])
    for f in range(counts.shape[0]):
        sys.stdout.write('sample %i/%i\n' % (f + 1, counts.shape[0]))

        ### the summary file is loaded lazily, once per sample
        bam_cache = None

        ### iterate over all genes and generate counts for
        ### the segments in the segment graph
        ### and the splice junctions in the splice graph
        for i in range(genes.shape[0]):
            sys.stdout.write('.')
            if i > 0 and i % 50 == 0:
                sys.stdout.write('%i\n' % i)
            gg = genes[i]
            if gg.segmentgraph.is_empty():
                gg.segmentgraph = Segmentgraph(gg)
            gg.start = gg.segmentgraph.segments.ravel().min()
            gg.stop = gg.segmentgraph.segments.ravel().max()

            counts[f, i] = Counts(gg.segmentgraph.segments.shape[1])

            if CFG['bam_to_sparse'] and (fn_bam[f].endswith('npz') or os.path.exists(re.sub(r'bam$', '', fn_bam[f]) + 'npz')):
                ### load counts from summary file
                if bam_cache is None:
                    bam_cache = dict()
                    if fn_bam[f].endswith('npz'):
                        tmp = sp.load(fn_bam[f])
                    else:
                        tmp = sp.load(re.sub(r'bam$', '', fn_bam[f]) + 'npz')
                    ### re-built sparse matrix
                    for c in sp.unique([re.sub(r'_reads_dat$', '', x) for x in tmp if x.endswith('_reads_dat')]):
                        bam_cache[c + '_reads'] = scipy.sparse.coo_matrix((tmp[c + '_reads_dat'], (tmp[c + '_reads_row'], tmp[c + '_reads_col'])), shape=tmp[c + '_reads_shp'], dtype='uint32').tocsc()
                        bam_cache[c + '_introns_m'] = tmp[c + '_introns_m']
                        bam_cache[c + '_introns_p'] = tmp[c + '_introns_p']
                    del tmp

                reads = bam_cache[gg.chr + '_reads']
                if reads.shape[0] == 0:
                    tracks = sp.zeros((1, gg.stop - gg.start), dtype='int')
                elif reads.shape[0] > 1:
                    tracks = reads[[0, 1 + int(gg.strand == '-')], gg.start:gg.stop].todense()
                else:
                    tracks = reads[:, gg.start:gg.stop].todense()

                ### look up the introns of the gene's own contig; the loop
                ### variable of the cache construction above only names the
                ### contig that happened to be loaded last
                if gg.strand == '-' and bam_cache[gg.chr + '_introns_m'].shape[0] > 0:
                    intron_list = get_intron_range(bam_cache[gg.chr + '_introns_m'], gg.start, gg.stop)
                else:
                    intron_list = get_intron_range(bam_cache[gg.chr + '_introns_p'], gg.start, gg.stop)
            else:
                ### add RNA-seq evidence to the gene structure
                (tracks, intron_list) = add_reads_from_bam(gg, fn_bam[f], ['exon_track', 'intron_list'], None, CFG['var_aware'], CFG['primary_only'])
                intron_list = intron_list[0]  ### TODO

            ### extract mean exon coverage for all segments
            segments = gg.segmentgraph.segments
            for j in range(segments.shape[1]):
                idx = sp.arange(segments[0, j], segments[1, j]) - gg.start
                profile = sp.sum(tracks[:, idx], axis=0)
                counts[f, i].segments[j] = sp.mean(profile)
                counts[f, i].seg_pos[j] = sp.sum(profile > 0)

            ### extract intron counts: one row (flat edge index, read count)
            ### per segment edge; the count is 0 if no intron supports it
            k, l = sp.where(gg.segmentgraph.seg_edges == 1)
            edge_rows = []
            for m in range(k.shape[0]):
                edge_id = sp.ravel_multi_index([k[m], l[m]], gg.segmentgraph.seg_edges.shape)
                conf = 0
                if intron_list.shape[0] > 0:
                    idx = sp.where((sp.absolute(intron_list[:, 0] - segments[1, k[m]]) <= intron_tol) &
                                   (sp.absolute(intron_list[:, 1] - segments[0, l[m]]) <= intron_tol))[0]
                    if idx.shape[0] > 0:
                        conf = sp.sum(intron_list[idx, 2])
                edge_rows.append(sp.array([edge_id, conf]))

            ### stack once instead of re-allocating the table for every edge
            if edge_rows:
                if counts[f, i].edges.shape[0] > 0:
                    edge_rows.insert(0, counts[f, i].edges)
                counts[f, i].edges = sp.vstack(edge_rows)

    if fn_out is not None:
        ### protocol -1 is a binary format: needs a binary handle that gets closed
        with open(fn_out, 'wb') as fh:
            cPickle.dump(counts, fh, -1)
    else:
        return counts
def count_graph_coverage(genes, fn_bam=None, options=None, fn_out=None):
    """Count segment and splice-junction coverage for all genes and samples.

    Args:
        genes: array of gene objects (sized via ``genes.shape[0]``), or a
            dict bundling the arguments under the keys 'genes', 'fn_bam',
            'options' and, optionally, 'fn_out'.
        fn_bam: one alignment file name or a list of them (one per sample).
            With ``options.sparse_bam`` set, an ``.npz`` / ``.hdf5`` summary
            (given directly or found next to the bam file) is read instead.
        options: settings object; ``sparse_bam``, ``confidence``,
            ``var_aware``, ``primary_only`` and ``mm_tag`` are read here.
        fn_out: when given, the result is pickled to this path and nothing
            is returned.

    Returns:
        Object array of shape (samples, genes) with one ``Counts`` entry per
        cell, or None when ``fn_out`` is given.
    """
    if fn_bam is None and isinstance(genes, dict):
        PAR = genes
        genes = PAR['genes']
        fn_bam = PAR['fn_bam']
        if 'fn_out' in PAR:
            fn_out = PAR['fn_out']
        options = PAR['options']

    ### genes carrying the 'splicegraph_edges_data' attribute are expanded first
    ### (guarded so that an empty gene array does not fail on genes[0])
    if genes.shape[0] > 0 and hasattr(genes[0], 'splicegraph_edges_data'):
        for gg in genes:
            gg.from_sparse()

    if not isinstance(fn_bam, list):
        fn_bam = [fn_bam]
    counts = sp.zeros((len(fn_bam), genes.shape[0]), dtype='object')

    intron_tol = 0

    ### the contig of each gene does not change between samples
    contigs = sp.array([x.chr for x in genes])

    sys.stdout.write('genes: %i\n' % genes.shape[0])
    for f in range(counts.shape[0]):
        sys.stdout.write('\nsample %i/%i\n' % (f + 1, counts.shape[0]))

        ### iterate over all genes and generate counts for
        ### the segments in the segment graph
        ### and the splice junctions in the splice graph
        ### iterate per contig, so the bam caching works better
        for contig in sp.unique(contigs):
            contig_idx = sp.where(contigs == contig)[0]
            bam_cache = dict()
            print('\ncounting %i genes on contig %s' % (contig_idx.shape[0], contig))
            for ii, i in enumerate(contig_idx):
                sys.stdout.write('.')
                if ii > 0 and ii % 50 == 0:
                    sys.stdout.write('%i/%i\n' % (ii, contig_idx.shape[0]))
                sys.stdout.flush()
                gg = genes[i]
                if gg.segmentgraph.is_empty():
                    gg.segmentgraph = Segmentgraph(gg)
                gg.start = gg.segmentgraph.segments.ravel().min()
                gg.stop = gg.segmentgraph.segments.ravel().max()

                counts[f, i] = Counts(gg.segmentgraph.segments.shape[1])

                fn_base = re.sub(r'bam$', '', fn_bam[f])
                if options.sparse_bam and \
                   (fn_bam[f].endswith('npz') or
                    os.path.exists(fn_base + 'npz') or
                    fn_bam[f].endswith('hdf5') or
                    os.path.exists(fn_base + 'hdf5')):
                    ### make sure that we query the right contig from cache
                    assert(gg.chr == contig)
                    (tracks, intron_list) = add_reads_from_sparse_bam(gg, fn_bam[f], contig, options.confidence, types=['exon_track', 'intron_list'], filter=None, cache=bam_cache)
                else:
                    ### add RNA-seq evidence to the gene structure
                    (tracks, intron_list) = add_reads_from_bam(gg, fn_bam[f], ['exon_track', 'intron_list'], None, options.var_aware, options.primary_only, mm_tag=options.mm_tag)
                    intron_list = intron_list[0]  ### TODO

                ### extract mean exon coverage for all segments
                segments = gg.segmentgraph.segments
                for j in range(segments.shape[1]):
                    idx = sp.arange(segments[0, j], segments[1, j]) - gg.start
                    profile = sp.sum(tracks[:, idx], axis=0)
                    counts[f, i].segments[j] = sp.mean(profile)
                    counts[f, i].seg_pos[j] = sp.sum(profile > 0)

                ### extract intron counts: one row (flat edge index, read count)
                ### per segment edge; the count is 0 if no intron supports it
                k, l = sp.where(gg.segmentgraph.seg_edges == 1)
                edge_rows = []
                for m in range(k.shape[0]):
                    edge_id = sp.ravel_multi_index([k[m], l[m]], gg.segmentgraph.seg_edges.shape)
                    conf = 0
                    if intron_list.shape[0] > 0:
                        idx = sp.where((sp.absolute(intron_list[:, 0] - segments[1, k[m]]) <= intron_tol) &
                                       (sp.absolute(intron_list[:, 1] - segments[0, l[m]]) <= intron_tol))[0]
                        if idx.shape[0] > 0:
                            conf = sp.sum(intron_list[idx, 2])
                    edge_rows.append(sp.array([edge_id, conf]))

                ### stack once instead of re-allocating the table for every edge
                if edge_rows:
                    if counts[f, i].edges.shape[0] > 0:
                        edge_rows.insert(0, counts[f, i].edges)
                    counts[f, i].edges = sp.vstack(edge_rows)

    if fn_out is not None:
        ### close the handle deterministically once the dump is written
        with open(fn_out, 'wb') as fh:
            pickle.dump(counts, fh, -1)
    else:
        return counts
def quantify_mult_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Quantify inclusion and skipping of a multiple-exon-skip event.

    Args:
        event: event object. The Python layout reads ``exons2`` (first row =
            upstream exon, last row = downstream exon, rows in between =
            skipped exons); the MATLAB layout reads ``exon_pre``,
            ``exon_aft`` and ``exons``.
        gene: object providing ``splicegraph`` and ``segmentgraph``.
        counts_segments: per-segment coverage, indexed by segment number.
        counts_edges: 2-column array; column 0 is the flat segment-edge
            index, column 1 the read count.
        CFG: settings dict; ``CFG['is_matlab']`` selects the data layout.

    Returns:
        Float array of length 2. ``cov[0]`` is the length-weighted mean
        coverage of the skipped exons plus the read counts of all introns of
        the inclusion isoform; ``cov[1]`` is the read count of the skipping
        intron. Introns missing from ``counts_edges`` contribute 0.
    """
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        ### edge ids are flat indices into the full 2-D edge matrix
        seg_shape = segs[0, 2].shape
        order = 'F'
        offset = 1

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg[0, 0][0, :] == event.exon_pre[0]) & (sg[0, 0][1, :] == event.exon_pre[1]))[0]
        idx_exon_aft = sp.where((sg[0, 0][0, :] == event.exon_aft[0]) & (sg[0, 0][1, :] == event.exon_aft[1]))[0]
        seg_exons = []
        ### NOTE(review): the loop bound uses exons.shape[1] while the body
        ### indexes exons[i] - confirm the expected layout of event.exons
        for i in range(0, event.exons.shape[1], 2):
            tmp = sp.where((sg[0, 0][0, :] == event.exons[i]) & (sg[0, 0][1, :] == event.exons[i + 1]))[0]
            seg_exons.append(sp.where(segs[0, 1][tmp, :])[1])

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs[0, 1][idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs[0, 1][idx_exon_aft, :])[1])
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        ### edge ids are flat indices into the full 2-D edge matrix
        seg_shape = segs.seg_edges.shape
        order = 'C'
        offset = 0

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
        idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[-1, 0]) & (sg.vertices[1, :] == event.exons2[-1, 1]))[0]
        seg_exons = []
        for i in range(1, event.exons2.shape[0] - 1):
            tmp = sp.where((sg.vertices[0, :] == event.exons2[i, 0]) & (sg.vertices[1, :] == event.exons2[i, 1]))[0]
            seg_exons.append(sp.where(segs.seg_match[tmp, :])[1])

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])

    def _edge_count(seg_from, seg_to):
        # read count of the junction seg_from -> seg_to, 0 if it was not counted
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        idx = sp.where(counts_edges[:, 0] == edge_id)[0]
        if idx.shape[0] > 0:
            return counts_edges[idx[0], 1]
        return 0

    ### unique() already returns its result sorted
    seg_exons_u = sp.unique([x for sublist in seg_exons for x in sublist])

    ### inner exons_cov
    cov[0] = sp.sum(counts_segments[seg_exons_u] * seg_lens[seg_exons_u]) / sp.sum(seg_lens[seg_exons_u])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    cov[0] += _edge_count(seg_exon_pre[-1], seg_exons[0][0])
    # exon_exon_aft_conf
    cov[0] += _edge_count(seg_exons[-1][-1], seg_exon_aft[0])
    # exon_pre_exon_aft_conf
    cov[1] = _edge_count(seg_exon_pre[-1], seg_exon_aft[0])
    # sum_inner_exon_conf
    for i in range(len(seg_exons) - 1):
        cov[0] += _edge_count(seg_exons[i][-1], seg_exons[i + 1][0])

    return cov
def quantify_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Quantify inclusion and skipping of a single exon-skip event.

    Args:
        event: event object. The Python layout reads ``exons2`` (rows:
            upstream, skipped, downstream exon); the MATLAB layout reads
            ``exon_pre``, ``exon`` and ``exon_aft``.
        gene: object providing ``splicegraph`` and ``segmentgraph``.
        counts_segments: per-segment coverage, indexed by segment number.
        counts_edges: 2-column array; column 0 is the flat segment-edge
            index, column 1 the read count.
        CFG: settings dict; ``CFG['is_matlab']`` selects the data layout.

    Returns:
        Float array of length 2. ``cov[0]`` is the length-weighted mean
        coverage of the skipped exon plus the read counts of its two flanking
        introns; ``cov[1]`` is the read count of the skipping intron.
        Introns missing from ``counts_edges`` contribute 0.
    """
    cov = sp.zeros((2, ), dtype='float')
    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        seg_shape = segs[0, 2].shape
        order = 'F'
        offset = 1

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg[0, 0][0, :] == event.exon_pre[0]) & (sg[0, 0][1, :] == event.exon_pre[1]))[0]
        idx_exon = sp.where((sg[0, 0][0, :] == event.exon[0]) & (sg[0, 0][1, :] == event.exon[1]))[0]
        idx_exon_aft = sp.where((sg[0, 0][0, :] == event.exon_aft[0]) & (sg[0, 0][1, :] == event.exon_aft[1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs[0, 1][idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs[0, 1][idx_exon_aft, :])[1])
        seg_exon = sp.sort(sp.where(segs[0, 1][idx_exon, :])[1])
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        seg_shape = segs.seg_edges.shape
        order = 'C'
        offset = 0

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
        idx_exon = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
        idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons2[2, 0]) & (sg.vertices[1, :] == event.exons2[2, 1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
        seg_exon = sp.sort(sp.where(segs.seg_match[idx_exon, :])[1])

    def _edge_count(seg_from, seg_to):
        # read count of the junction seg_from -> seg_to, 0 if it was not counted
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        idx = sp.where(counts_edges[:, 0] == edge_id)[0]
        if idx.shape[0] > 0:
            return counts_edges[idx[0], 1]
        return 0

    # get inner exon cov
    cov[0] = sp.sum(counts_segments[seg_exon] * seg_lens[seg_exon]) / sp.sum(seg_lens[seg_exon])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    cov[0] += _edge_count(seg_exon_pre[-1], seg_exon[0])
    # exon_exon_aft_conf
    cov[0] += _edge_count(seg_exon[-1], seg_exon_aft[0])
    # exon_pre_exon_aft_conf
    cov[1] = _edge_count(seg_exon_pre[-1], seg_exon_aft[0])

    return cov
def quantify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    """Quantify both isoforms of an alternative 3'/5' splice-site event.

    Parameters
    ----------
    event : object
        With ``CFG['is_matlab']`` unset, ``event.exons1`` and ``event.exons2``
        each hold the two exons (rows) of one isoform as (start, stop).
        Otherwise ``event.exon_alt1``, ``event.exon_alt2`` and
        ``event.exon_const`` are read.
    gene : object
        Provides ``gene.splicegraph`` and ``gene.segmentgraph``.
    counts_segments : array
        Coverage value per segment, indexed by segment number.
    counts_edges : 2-D array
        Column 0 is the linearised (from-segment, to-segment) edge index,
        column 1 the count attached to that edge.
    CFG : dict
        Only ``CFG['is_matlab']`` is read here.

    Returns
    -------
    cov : float array of length 2
        One value per isoform: its intron count, plus the length-weighted mean
        coverage of the differential segments for the isoform owning them.

    Raises
    ------
    AssertionError
        If an exon maps to no segment or an intron has no row in
        ``counts_edges``.
    Exception
        If the differential segments belong to neither isoform.
    SystemExit
        If both exons differ between the two isoforms.
    """
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        # NOTE(review): this path cannot run to completion as written. Only the
        # missing ``[0]`` on the sp.where() results and the dims passed to
        # ravel_multi_index were repaired. Still open:
        #   * ``seg_diff`` is never assigned on this path (NameError below);
        #   * ``+ 1`` is added to the matching row positions rather than to the
        #     edge id and no Fortran order is requested, unlike
        #     quantify_exon_skip in this file;
        #   * the fallback segment search compares the segment end with ``>=``
        #     while the other branch uses ``<=``.
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        seg_shape = segs[0, 2].shape

        idx_exon_alt1 = sp.where((sg[0, 0][0, :] == event.exon_alt1[0]) &
                                 (sg[0, 0][1, :] == event.exon_alt1[1]))[0]
        idx_exon_alt2 = sp.where((sg[0, 0][0, :] == event.exon_alt2[0]) &
                                 (sg[0, 0][1, :] == event.exon_alt2[1]))[0]
        idx_exon_const = sp.where((sg[0, 0][0, :] == event.exon_const[0]) &
                                  (sg[0, 0][1, :] == event.exon_const[1]))[0]
        if idx_exon_alt1.shape[0] == 0:
            segs_exon_alt1 = sp.where((segs[0, 0][0, :] >= event.exon_alt1[0]) &
                                      (segs[0, 0][1, :] >= event.exon_alt1[1]))[0]
        else:
            segs_exon_alt1 = sp.where(segs[0, 1][idx_exon_alt1, :])[1]
        if idx_exon_alt2.shape[0] == 0:
            segs_exon_alt2 = sp.where((segs[0, 0][0, :] >= event.exon_alt2[0]) &
                                      (segs[0, 0][1, :] >= event.exon_alt2[1]))[0]
        else:
            segs_exon_alt2 = sp.where(segs[0, 1][idx_exon_alt2, :])[1]
        if idx_exon_const.shape[0] == 0:
            segs_exon_const = sp.where((segs[0, 0][0, :] >= event.exon_const[0]) &
                                       (segs[0, 0][1, :] >= event.exon_const[1]))[0]
        else:
            segs_exon_const = sp.where(segs[0, 1][idx_exon_const, :])[1]

        assert (segs_exon_alt1.shape[0] > 0)
        assert (segs_exon_alt2.shape[0] > 0)
        assert (segs_exon_const.shape[0] > 0)

        cov[1] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])

        ### check intron confirmations as sum of valid intron scores
        ### intron score is the number of reads confirming this intron
        if max(segs_exon_alt1[-1], segs_exon_alt2[-1]) < segs_exon_const[0]:
            # intron1_conf
            idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
                [segs_exon_alt1[0], segs_exon_const[-1]], seg_shape))[0] + 1
            assert (idx.shape[0] > 0)
            cov[0] += counts_edges[idx, 1]
            # intron2_conf
            idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
                [segs_exon_alt2[0], segs_exon_const[-1]], seg_shape))[0] + 1
            assert (idx.shape[0] > 0)
            cov[1] += counts_edges[idx, 1]
        elif min(segs_exon_alt1[0], segs_exon_alt2[0]) > segs_exon_const[-1]:
            # intron1_conf
            idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
                [segs_exon_const[0], segs_exon_alt1[-1]], seg_shape))[0] + 1
            assert (idx.shape[0] > 0)
            cov[0] += counts_edges[idx, 1]
            # intron2_conf
            idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
                [segs_exon_const[0], segs_exon_alt2[-1]], seg_shape))[0] + 1
            assert (idx.shape[0] > 0)
            cov[1] += counts_edges[idx, 1]
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        # ravel_multi_index needs the full 2-D shape of the edge matrix; a bare
        # int is treated as a 1-D shape and rejects a (from, to) index pair
        seg_shape = segs.seg_edges.shape

        def segments_of(exon):
            # Segments of the splice-graph vertex matching `exon`; if no vertex
            # matches, all segments lying within the exon's coordinates.
            idx = sp.where((sg.vertices[0, :] == exon[0]) &
                           (sg.vertices[1, :] == exon[1]))[0]
            if idx.shape[0] == 0:
                return sp.where((segs.segments[0, :] >= exon[0]) &
                                (segs.segments[1, :] <= exon[1]))[0]
            return sp.where(segs.seg_match[idx, :])[1]

        ### find exons corresponding to event
        segs_exon11 = segments_of(event.exons1[0, :])
        segs_exon12 = segments_of(event.exons1[1, :])
        segs_exon21 = segments_of(event.exons2[0, :])
        segs_exon22 = segments_of(event.exons2[1, :])

        assert (segs_exon11.shape[0] > 0)
        assert (segs_exon12.shape[0] > 0)
        assert (segs_exon21.shape[0] > 0)
        assert (segs_exon22.shape[0] > 0)

        # array_equal is False for different lengths, where an elementwise
        # ``==`` would broadcast or raise
        if sp.array_equal(segs_exon11, segs_exon21):
            seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
            if seg_diff.shape[0] == 0:
                seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        elif sp.array_equal(segs_exon12, segs_exon22):
            seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
            if seg_diff.shape[0] == 0:
                seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        else:
            sys.stderr.write("ERROR: both exons differ in alt prime event in quantify_alt_prime\n")
            sys.exit(1)

        # exon_diff_cov: credit the differential segments to the isoform whose
        # exons contain them
        in_iso1 = sp.intersect1d(seg_diff, sp.r_[segs_exon11, segs_exon12]).shape[0] > 0
        in_iso2 = sp.intersect1d(seg_diff, sp.r_[segs_exon21, segs_exon22]).shape[0] > 0
        if in_iso1:
            cov[0] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
        elif in_iso2:
            cov[1] += sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
        else:
            raise Exception('differential segment not part of any other segment')

        ### check intron confirmations as sum of valid intron scores
        ### intron score is the number of reads confirming this intron
        # intron1_conf
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
            [segs_exon11[-1], segs_exon12[0]], seg_shape))[0]
        assert (idx.shape[0] > 0)
        cov[0] += counts_edges[idx[0], 1]
        # intron2_conf
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
            [segs_exon21[-1], segs_exon22[0]], seg_shape))[0]
        assert (idx.shape[0] > 0)
        cov[1] += counts_edges[idx[0], 1]

    return cov
def quantify_mutex_exons(event, gene, counts_segments, counts_edges, CFG):
    """Quantify both isoforms of a mutually-exclusive-exons event.

    Parameters
    ----------
    event : object
        With ``CFG['is_matlab']`` unset, ``event.exons1`` holds the exons of
        the first isoform (first row: preceding exon, row 1: first alternative
        exon, last row: following exon) and ``event.exons2[1]`` the second
        alternative exon. Otherwise ``event.exon_pre``, ``event.exon_aft``,
        ``event.exon1`` and ``event.exon2`` are read.
    gene : object
        Provides ``gene.splicegraph`` and ``gene.segmentgraph``.
    counts_segments : array
        Coverage value per segment, indexed by segment number.
    counts_edges : 2-D array
        Column 0 is the linearised (from-segment, to-segment) edge index,
        column 1 the count attached to that edge.
    CFG : dict
        Only ``CFG['is_matlab']`` is read here.

    Returns
    -------
    cov : float array of length 2
        ``cov[k]``: length-weighted mean coverage of alternative exon ``k``
        plus the counts of its two flanking introns. An edge without an entry
        in ``counts_edges`` contributes nothing.
    """
    # result vector; it must exist before the per-isoform values are stored
    cov = sp.zeros((2, ), dtype='float')

    sg = gene.splicegraph
    segs = gene.segmentgraph

    if CFG['is_matlab']:
        seg_lens = segs[0, 0][1, :] - segs[0, 0][0, :]
        # full 2-D shape: ravel_multi_index rejects an index pair for int dims
        seg_shape = segs[0, 2].shape
        order = 'F'
        offset = 1

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg[0, 0][0, :] == event.exon_pre[0]) &
                                (sg[0, 0][1, :] == event.exon_pre[1]))[0]
        idx_exon_aft = sp.where((sg[0, 0][0, :] == event.exon_aft[0]) &
                                (sg[0, 0][1, :] == event.exon_aft[1]))[0]
        idx_exon1 = sp.where((sg[0, 0][0, :] == event.exon1[0]) &
                             (sg[0, 0][1, :] == event.exon1[1]))[0]
        idx_exon2 = sp.where((sg[0, 0][0, :] == event.exon2[0]) &
                             (sg[0, 0][1, :] == event.exon2[1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs[0, 1][idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs[0, 1][idx_exon_aft, :])[1])
        seg_exon1 = sp.sort(sp.where(segs[0, 1][idx_exon1, :])[1])
        seg_exon2 = sp.sort(sp.where(segs[0, 1][idx_exon2, :])[1])
    else:
        seg_lens = segs.segments[1, :] - segs.segments[0, :]
        # full 2-D shape: ravel_multi_index rejects an index pair for int dims
        seg_shape = segs.seg_edges.shape
        order = 'C'
        offset = 0

        ### find exons corresponding to event
        idx_exon_pre = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) &
                                (sg.vertices[1, :] == event.exons1[0, 1]))[0]
        idx_exon_aft = sp.where((sg.vertices[0, :] == event.exons1[-1, 0]) &
                                (sg.vertices[1, :] == event.exons1[-1, 1]))[0]
        idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) &
                             (sg.vertices[1, :] == event.exons1[1, 1]))[0]
        idx_exon2 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) &
                             (sg.vertices[1, :] == event.exons2[1, 1]))[0]

        ### find segments corresponding to exons
        seg_exon_pre = sp.sort(sp.where(segs.seg_match[idx_exon_pre, :])[1])
        seg_exon_aft = sp.sort(sp.where(segs.seg_match[idx_exon_aft, :])[1])
        seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
        seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])

    def edge_count(seg_from, seg_to):
        # Count stored for the edge seg_from -> seg_to, or 0 if it has no row.
        edge_id = sp.ravel_multi_index([seg_from, seg_to], seg_shape, order=order) + offset
        rows = sp.where(counts_edges[:, 0] == edge_id)[0]
        if rows.shape[0] > 0:
            return counts_edges[rows[0], 1]
        return 0

    # exon1 cov
    cov[0] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(seg_lens[seg_exon1])
    # exon2 cov
    cov[1] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(seg_lens[seg_exon2])

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon1_conf
    cov[0] += edge_count(seg_exon_pre[-1], seg_exon1[0])
    # exon_pre_exon2_conf
    cov[1] += edge_count(seg_exon_pre[-1], seg_exon2[0])
    # exon1_exon_aft_conf
    cov[0] += edge_count(seg_exon1[-1], seg_exon_aft[0])
    # exon2_exon_aft_conf
    cov[1] += edge_count(seg_exon2[-1], seg_exon_aft[0])

    return cov
def verify_mult_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Check a multiple-exon-skip event against segment and edge counts.

    NOTE: a second ``verify_mult_exon_skip`` is defined further down in this
    file; being later, it is the one bound to the name after import.

    Returns ``(verified, info)``.

    ``info`` slots:
        (0) valid, (1) exon_pre_cov, (2) exons_cov, (3) exon_aft_cov,
        (4) exon_pre_exon_conf, (5) exon_exon_aft_conf,
        (6) exon_pre_exon_aft_conf, (7) sum_inner_exon_conf,
        (8) num_inner_exon

    ``verified`` slots are set to 1 when, in order: the skipped exons are
    covered well enough relative to the flanking exons; the intron into the
    first inner exon, the intron out of the last inner exon, and the mean of
    the introns between inner exons reach ``min_non_skip_count``; the skipping
    intron reaches ``min_skip_count`` (all from ``CFG['mult_exon_skip']``).
    """
    # [verified, info] = verify_mult_exon_skip(event, gene, counts_segments, counts_edges, CFG)
    verified = [0, 0, 0, 0, 0]
    info = [1, 0, 0, 0, 0, 0, 0, 0, 0]

    ex1 = event.exons1
    ex2 = event.exons2

    ### check validity of exon coordinates (>=0), then (start < stop)
    if (sp.any(ex1 < 0) or sp.any(ex2 < 0) or
            sp.any(ex1[:, 1] - ex1[:, 0] < 1) or sp.any(ex2[:, 1] - ex2[:, 0] < 1)):
        info[0] = 0
        return (verified, info)

    vertices = gene.splicegraph.vertices
    segs = gene.segmentgraph

    def exon_segments(exon):
        # segment indices of the splice-graph vertex with these coordinates
        hit = sp.where((vertices[0, :] == exon[0]) & (vertices[1, :] == exon[1]))[0]
        return sp.where(segs.seg_match[hit, :])[1]

    def mean_cov(seg_idx):
        # length-weighted mean coverage over the given segments
        return sp.sum(counts_segments[seg_idx] * seg_lens[seg_idx]) / sp.sum(seg_lens[seg_idx])

    def edge_rows(seg_from, seg_to):
        # rows of counts_edges that describe the edge seg_from -> seg_to
        edge_id = sp.ravel_multi_index([seg_from, seg_to], segs.seg_edges.shape)
        return sp.where(counts_edges[:, 0] == edge_id)[0]

    ### find segments corresponding to exons
    seg_exons = [exon_segments(ex2[i]) for i in range(1, ex2.shape[0] - 1)]
    seg_exon_pre = sp.sort(exon_segments(ex2[0]))
    seg_exon_aft = sp.sort(exon_segments(ex2[-1]))

    pooled = []
    for inner in seg_exons:
        pooled.extend(inner)
    seg_exons_u = sp.sort(sp.unique(pooled))

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    info[1] = mean_cov(seg_exon_pre)   # exon_pre_cov
    info[3] = mean_cov(seg_exon_aft)   # exon_aft_cov
    info[2] = mean_cov(seg_exons_u)    # exons_cov

    ### check if coverage of skipped exon is >= than FACTOR times average of pre and after
    if info[2] >= CFG['mult_exon_skip']['min_skip_rel_cov'] * (info[1] + info[3]) / 2:
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    rows = edge_rows(seg_exon_pre[-1], seg_exons[0][0])
    if rows.shape[0] > 0:
        info[4] = counts_edges[rows[0], 1]
    # exon_exon_aft_conf
    rows = edge_rows(seg_exons[-1][-1], seg_exon_aft[0])
    if rows.shape[0] > 0:
        info[5] = counts_edges[rows[0], 1]
    # exon_pre_exon_aft_conf
    rows = edge_rows(seg_exon_pre[-1], seg_exon_aft[0])
    if rows.shape[0] > 0:
        info[6] = counts_edges[rows[0], 1]
    # sum_inner_exon_conf
    for left, right in zip(seg_exons[:-1], seg_exons[1:]):
        rows = edge_rows(left[-1], right[0])
        if rows.shape[0] > 0:
            info[7] += counts_edges[rows[0], 1]

    # num_inner_exon
    info[8] = ex2.shape[0] - 2

    if info[4] >= CFG['mult_exon_skip']['min_non_skip_count']:
        verified[1] = 1
    if info[5] >= CFG['mult_exon_skip']['min_non_skip_count']:
        verified[2] = 1
    if (info[7] / info[8]) >= CFG['mult_exon_skip']['min_non_skip_count']:
        verified[3] = 1
    if info[6] >= CFG['mult_exon_skip']['min_skip_count']:
        verified[4] = 1

    return (verified, info)
def verify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos, CFG):
    """Check an intron-retention event against segment and edge counts.

    NOTE: a second ``verify_intron_retention`` is defined further down in this
    file; being later, it is the one bound to the name after import.

    Parameters
    ----------
    event : object
        ``event.exons1`` holds the two flanking exons (rows) as (start, stop);
        ``event.exons2`` is indexed as ``[0]`` / ``[1]`` for start / stop.
    gene : object
        Provides ``gene.splicegraph`` and ``gene.segmentgraph``.
    counts_segments : array
        Coverage value per segment.
    counts_edges : 2-D array
        Column 0: linearised (from-segment, to-segment) edge index,
        column 1: count attached to that edge.
    counts_seg_pos : array
        Per-segment value that is summed over the intron segments and divided
        by their total length to give ``info[5]``.
    CFG : dict
        Thresholds are read from ``CFG['intron_retention']``.

    Returns
    -------
    (verified, info)
        ``info``: (0) valid, (1) intron_cov, (2) exon1_cov, (3) exon2_cov,
        (4) intron_conf, (5) intron_cov_region.
        ``verified[0]`` is 1 if the intron coverage criteria hold,
        ``verified[1]`` is 1 if the intron count reaches
        ``min_non_retention_count``. A missing intron edge leaves
        ``info[4]`` at 0.
    """
    # [verified, info] = verify_intron_retention(event, fn_bam, CFG)
    verified = [0, 0]

    # (0) valid, (1) intron_cov, (2) exon1_cov, (3), exon2_cov
    # (4) intron_conf, (5) intron_cov_region
    info = [1, 0, 0, 0, 0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    elif sp.any(event.exons1[:, 1] - event.exons1[:, 0] < 1) or sp.any((event.exons2[1] - event.exons2[0]) < 1):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) &
                         (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) &
                         (sg.vertices[1, :] == event.exons1[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])
    # everything between the two exons that belongs to neither is the intron
    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])
    seg_intron = sp.setdiff1d(seg_all, seg_exon1)
    seg_intron = sp.setdiff1d(seg_intron, seg_exon2)
    assert (seg_intron.shape[0] > 0)

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    ### compute exon coverages as mean of position wise coverage
    # exon1_cov
    info[2] = sp.sum(counts_segments[seg_exon1] * seg_lens[seg_exon1]) / sp.sum(seg_lens[seg_exon1])
    # exon2_cov
    info[3] = sp.sum(counts_segments[seg_exon2] * seg_lens[seg_exon2]) / sp.sum(seg_lens[seg_exon2])
    # intron_cov
    info[1] = sp.sum(counts_segments[seg_intron] * seg_lens[seg_intron]) / sp.sum(seg_lens[seg_intron])
    # intron_cov_region
    # float() keeps this a true ratio even when both operands are integer
    # typed and `/` floors (Python 2 semantics)
    info[5] = sp.sum(counts_seg_pos[seg_intron]) / float(sp.sum(seg_lens[seg_intron]))

    ### check if counts match verification criteria
    if info[1] > CFG['intron_retention']['min_retention_cov'] and \
       info[5] > CFG['intron_retention']['min_retention_region'] and \
       info[1] >= CFG['intron_retention']['min_retention_rel_cov'] * (info[2] + info[3]) / 2:
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon1[-1], seg_exon2[0]], segs.seg_edges.shape))[0]
    # store a scalar; an absent edge keeps the default 0 instead of putting an
    # empty array into info, whose comparison below would be ambiguous
    if idx.shape[0] > 0:
        info[4] = counts_edges[idx[0], 1]

    if info[4] >= CFG['intron_retention']['min_non_retention_count']:
        verified[1] = 1

    return (verified, info)
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    """Check an alternative 3'/5' splice-site event against graph counts.

    Parameters
    ----------
    event : object
        ``event.exons1`` and ``event.exons2`` each hold the two exons (rows)
        of one isoform as (start, stop).
    gene : object
        Provides ``gene.splicegraph`` and ``gene.segmentgraph``.
    counts_segments : array
        Coverage value per segment.
    counts_edges : 2-D array
        Column 0: linearised (from-segment, to-segment) edge index,
        column 1: count attached to that edge.
    CFG : dict
        Thresholds are read from ``CFG['alt_prime']``.

    Returns
    -------
    (verified, info)
        ``info``: (0) valid, (1) exon_diff_cov, (2) exon_const_cov,
        (3) intron1_conf, (4) intron2_conf.
        ``verified[0]`` is 1 if the differential coverage reaches
        ``min_diff_rel_cov`` times the constant coverage; ``verified[1]`` is 1
        if both intron counts reach ``min_intron_count``.

    Raises
    ------
    AssertionError
        If an exon maps to no segment or an intron has no row in
        ``counts_edges``.
    SystemExit
        If both exons differ between the two isoforms.
    """
    # [verified, info] = verify_exon_skip(event, fn_bam, cfg)

    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)

    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] != event.exons2[1, 0]):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    def segments_of(exon):
        # Segments of the splice-graph vertex matching `exon`; if no vertex
        # matches, all segments lying within the exon's coordinates.
        idx = sp.where((sg.vertices[0, :] == exon[0]) &
                       (sg.vertices[1, :] == exon[1]))[0]
        if idx.shape[0] == 0:
            return sp.where((segs.segments[0, :] >= exon[0]) &
                            (segs.segments[1, :] <= exon[1]))[0]
        return sp.where(segs.seg_match[idx, :])[1]

    ### find exons corresponding to event
    segs_exon11 = segments_of(event.exons1[0, :])
    segs_exon12 = segments_of(event.exons1[1, :])
    segs_exon21 = segments_of(event.exons2[0, :])
    segs_exon22 = segments_of(event.exons2[1, :])

    assert(segs_exon11.shape[0] > 0)
    assert(segs_exon12.shape[0] > 0)
    assert(segs_exon21.shape[0] > 0)
    assert(segs_exon22.shape[0] > 0)

    # array_equal is False for different lengths, where an elementwise ``==``
    # would broadcast or raise
    if sp.array_equal(segs_exon11, segs_exon21):
        seg_exon_const = segs_exon11
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        seg_const = sp.intersect1d(segs_exon12, segs_exon22)
    elif sp.array_equal(segs_exon12, segs_exon22):
        seg_exon_const = segs_exon12
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        seg_const = sp.intersect1d(segs_exon21, segs_exon11)
    else:
        sys.stderr.write("ERROR: both exons differ in alt prime event in verify_alt_prime\n")
        sys.exit(1)
    # constant part: the shared exon plus the shared part of the alternative exon
    seg_const = sp.r_[seg_exon_const, seg_const]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] * seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    # first matching row as a scalar, so min() below compares plain numbers
    info[3] = counts_edges[idx[0], 1]
    # intron2_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    info[4] = counts_edges[idx[0], 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
def verify_mult_exon_skip(event, gene, counts_segments, counts_edges, CFG):
    """Verify a multiple-exon-skip event using segment and edge counts.

    Returns ``(verified, info)``.

    ``info`` slots:
        (0) valid, (1) exon_pre_cov, (2) exons_cov, (3) exon_aft_cov,
        (4) exon_pre_exon_conf, (5) exon_exon_aft_conf,
        (6) exon_pre_exon_aft_conf, (7) sum_inner_exon_conf,
        (8) num_inner_exon, (9) len_inner_exon

    ``verified`` slots, each 0 or 1:
        [0] coverage of the skipped exons relative to the flanking exons,
        [1] count of the intron into the first inner exon,
        [2] count of the intron out of the last inner exon,
        [3] mean count of the introns between inner exons,
        [4] count of the intron skipping all inner exons.
    Thresholds are read from ``CFG['mult_exon_skip']``.
    """
    # [verified, info] = verify_mult_exon_skip(event, gene, counts_segments, counts_edges, CFG)
    verified = [0] * 5
    info = [1] + [0] * 9

    exons_short = event.exons1
    exons_long = event.exons2

    ### check validity of exon coordinates (>=0)
    coords_invalid = sp.any(exons_short < 0) or sp.any(exons_long < 0)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    if not coords_invalid:
        coords_invalid = (sp.any(exons_short[:, 1] - exons_short[:, 0] < 1) or
                          sp.any(exons_long[:, 1] - exons_long[:, 0] < 1))
    if coords_invalid:
        info[0] = 0
        return (verified, info)

    graph_exons = gene.splicegraph.vertices
    segs = gene.segmentgraph

    ### find exons corresponding to event and their segments
    seg_rows = []
    for exon in exons_long:
        hit = sp.where((graph_exons[0, :] == exon[0]) & (graph_exons[1, :] == exon[1]))[0]
        seg_rows.append(sp.where(segs.seg_match[hit, :])[1])
    seg_exon_pre = sp.sort(seg_rows[0])
    seg_exon_aft = sp.sort(seg_rows[-1])
    seg_exons = seg_rows[1:-1]
    seg_exons_u = sp.sort(sp.unique([s for inner in seg_exons for s in inner]))

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_pre_cov (1), exons_cov (2), exon_aft_cov (3): length-weighted means
    for slot, seg_idx in ((1, seg_exon_pre), (3, seg_exon_aft), (2, seg_exons_u)):
        weighted = sp.sum(counts_segments[seg_idx] * seg_lens[seg_idx])
        info[slot] = weighted / sp.sum(seg_lens[seg_idx])

    thresholds = CFG['mult_exon_skip']

    ### check if coverage of skipped exon is >= than FACTOR times average of pre and after
    if info[2] >= thresholds['min_skip_rel_cov'] * (info[1] + info[3]) / 2:
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # exon_pre_exon_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exons[0][0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[4] = counts_edges[idx[0], 1]
    # exon_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exons[-1][-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[5] = counts_edges[idx[0], 1]
    # exon_pre_exon_aft_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [seg_exon_pre[-1], seg_exon_aft[0]], segs.seg_edges.shape))[0]
    if idx.shape[0] > 0:
        info[6] = counts_edges[idx[0], 1]
    # sum_inner_exon_conf
    pos = 0
    while pos < len(seg_exons) - 1:
        idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
            [seg_exons[pos][-1], seg_exons[pos + 1][0]], segs.seg_edges.shape))[0]
        if idx.shape[0] > 0:
            info[7] += counts_edges[idx[0], 1]
        pos += 1

    # num_inner_exon
    info[8] = exons_long.shape[0] - 2
    # len_inner_exon
    info[9] = sp.sum(exons_long[1:-1, 1] - exons_long[1:-1, 0])

    if info[4] >= thresholds['min_non_skip_count']:
        verified[1] = 1
    if info[5] >= thresholds['min_non_skip_count']:
        verified[2] = 1
    if (info[7] / info[8]) >= thresholds['min_non_skip_count']:
        verified[3] = 1
    if info[6] >= thresholds['min_skip_count']:
        verified[4] = 1

    return (verified, info)
def verify_intron_retention(event, gene, counts_segments, counts_edges, counts_seg_pos, CFG):
    """Verify an intron-retention event using segment and edge counts.

    Parameters
    ----------
    event : object
        ``event.exons1`` holds the two flanking exons (rows) as (start, stop);
        ``event.exons2`` is indexed as ``[0]`` / ``[1]`` for start / stop.
    gene : object
        Provides ``gene.splicegraph`` and ``gene.segmentgraph``.
    counts_segments : array
        Coverage value per segment.
    counts_edges : 2-D array
        Column 0: linearised (from-segment, to-segment) edge index,
        column 1: count attached to that edge.
    counts_seg_pos : array
        Per-segment value; its sum over the intron segments divided by their
        total length becomes ``info[5]``.
    CFG : dict
        Thresholds are read from ``CFG['intron_retention']``.

    Returns
    -------
    (verified, info)
        ``info``: (0) valid, (1) intron_cov, (2) exon1_cov, (3) exon2_cov,
        (4) intron_conf, (5) intron_cov_region.
        ``verified[0]``: intron coverage criteria met; ``verified[1]``: intron
        count reaches ``min_non_retention_count``. If the intron has no row in
        ``counts_edges``, ``info[4]`` stays 0.
    """
    # [verified, info] = verify_intron_retention(event, fn_bam, CFG)
    verified = [0, 0]

    # (0) valid, (1) intron_cov, (2) exon1_cov, (3), exon2_cov
    # (4) intron_conf, (5) intron_cov_region
    info = [1, 0, 0, 0, 0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)
    ### check validity of exon coordinates (start < stop && non-overlapping)
    if sp.any(event.exons1[:, 1] - event.exons1[:, 0] < 1) or sp.any((event.exons2[1] - event.exons2[0]) < 1):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph
    params = CFG['intron_retention']

    def weighted_cov(seg_idx):
        # length-weighted mean coverage over the given segments
        return sp.sum(counts_segments[seg_idx] * seg_lens[seg_idx]) / sp.sum(seg_lens[seg_idx])

    ### find exons corresponding to event
    idx_exon1 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) &
                         (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    idx_exon2 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) &
                         (sg.vertices[1, :] == event.exons1[1, 1]))[0]

    ### find segments corresponding to exons
    seg_exon1 = sp.sort(sp.where(segs.seg_match[idx_exon1, :])[1])
    seg_exon2 = sp.sort(sp.where(segs.seg_match[idx_exon2, :])[1])
    # segments between the exons that belong to neither of them
    seg_all = sp.arange(seg_exon1[0], seg_exon2[-1])
    seg_intron = sp.setdiff1d(sp.setdiff1d(seg_all, seg_exon1), seg_exon2)
    assert(seg_intron.shape[0] > 0)

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    ### compute exon coverages as mean of position wise coverage
    info[2] = weighted_cov(seg_exon1)    # exon1_cov
    info[3] = weighted_cov(seg_exon2)    # exon2_cov
    info[1] = weighted_cov(seg_intron)   # intron_cov
    # intron_cov_region; float() prevents floor division when both operands
    # are integer typed (Python 2 semantics of `/`)
    info[5] = sp.sum(counts_seg_pos[seg_intron]) / float(sp.sum(seg_lens[seg_intron]))

    ### check if counts match verification criteria
    if (info[1] > params['min_retention_cov'] and
            info[5] > params['min_retention_region'] and
            info[1] >= params['min_retention_rel_cov'] * (info[2] + info[3]) / 2):
        verified[0] = 1

    ### check intron confirmation as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron conf
    edge_id = sp.ravel_multi_index([seg_exon1[-1], seg_exon2[0]], segs.seg_edges.shape)
    idx = sp.where(counts_edges[:, 0] == edge_id)[0]
    # keep info[4] a scalar: an empty lookup would otherwise put an empty
    # array into info and make the threshold test below ambiguous
    if idx.shape[0] > 0:
        info[4] = counts_edges[idx[0], 1]

    if info[4] >= params['min_non_retention_count']:
        verified[1] = 1

    return (verified, info)
def count_graph_coverage(genes, fn_bam=None, CFG=None, fn_out=None):
    """Collect segment coverage and intron counts for every gene and BAM file.

    Parameters
    ----------
    genes : array of gene objects, or dict
        Genes to process. If ``fn_bam`` is None and ``genes`` is a dict, the
        dict is unpacked instead: keys ``'genes'``, ``'fn_bam'``, ``'CFG'``
        and optionally ``'fn_out'``.
    fn_bam : str or list of str
        Alignment file(s); a single name is wrapped into a list.
    CFG : dict
        ``CFG['read_filter']``, ``CFG['var_aware']`` and
        ``CFG['primary_only']`` are forwarded to ``add_reads_from_bam``.
    fn_out : str, optional
        If given, the result is pickled to this path and nothing is returned.

    Returns
    -------
    counts : object array of shape (len(fn_bam), number of genes), or None
        Each entry is a ``Counts`` object whose ``segments``, ``seg_pos`` and
        ``edges`` attributes are filled here. ``edges`` rows are
        ``[linearised (from, to) segment index, count]``. Returned only when
        ``fn_out`` is None.

    Side effects: sets ``segmentgraph`` (if missing), ``start`` and ``stop``
    on every gene and writes progress dots to stdout.
    """
    # [counts] = count_graph_coverage(genes, fn_bam, CFG, fn_out)

    ### alternative calling convention: all arguments bundled in one dict
    if fn_bam is None and isinstance(genes, dict):
        PAR = genes
        genes = PAR['genes']
        fn_bam = PAR['fn_bam']
        if 'fn_out' in PAR:
            fn_out = PAR['fn_out']
        CFG = PAR['CFG']

    if not isinstance(fn_bam, list):
        fn_bam = [fn_bam]
    counts = sp.zeros((len(fn_bam), genes.shape[0]), dtype='object')

    # tolerance when matching intron coordinates to segment borders
    intron_tol = 0

    for f in range(counts.shape[0]):
        ### iterate over all genes and generate counts for
        ### the segments in the segment graph
        ### and the splice junctions in the splice graph
        for i in range(genes.shape[0]):
            sys.stdout.write('.')
            if i > 0 and i % 50 == 0:
                sys.stdout.write('%i\n' % i)
            gg = genes[i]
            if gg.segmentgraph is None:
                gg.segmentgraph = Segmentgraph(gg)
            gg.start = gg.segmentgraph.segments.ravel().min()
            gg.stop = gg.segmentgraph.segments.ravel().max()

            ### add RNA-seq evidence to the gene structure
            (tracks, intron_list) = add_reads_from_bam(gg, fn_bam[f], ['exon_track', 'intron_list'],
                                                       CFG['read_filter'], CFG['var_aware'],
                                                       CFG['primary_only'])
            intron_list = intron_list[0]  ### TODO

            segments = gg.segmentgraph.segments
            seg_edges = gg.segmentgraph.seg_edges

            ### extract mean exon coverage for all segments
            curr = Counts(segments.shape[1])
            counts[f, i] = curr
            for j in range(segments.shape[1]):
                idx = sp.arange(segments[0, j], segments[1, j]) - gg.start
                # per-position depth, summed over all track rows
                depth = sp.sum(tracks[:, idx], axis=0)
                curr.segments[j] = sp.mean(depth)
                curr.seg_pos[j] = sp.sum(depth > 0)

            ### extract intron counts
            k, l = sp.where(seg_edges == 1)
            rows = []
            for m in range(k.shape[0]):
                idx = sp.where((sp.absolute(intron_list[:, 0] - segments[1, k[m]]) <= intron_tol) &
                               (sp.absolute(intron_list[:, 1] - segments[0, l[m]]) <= intron_tol))[0]
                edge_id = sp.ravel_multi_index([k[m], l[m]], seg_edges.shape)
                if idx.shape[0] > 0:
                    rows.append([edge_id, sp.sum(intron_list[idx, 2])])
                else:
                    rows.append([edge_id, 0])
            # build the edge table once instead of re-concatenating per edge
            if rows:
                new_edges = sp.array(rows)
                if curr.edges.shape[0] == 0:
                    curr.edges = new_edges
                else:
                    curr.edges = sp.r_[curr.edges, new_edges]

    if fn_out is not None:
        # protocol -1 writes a binary pickle, so open in binary mode; the
        # with-block closes the file even if dumping fails
        with open(fn_out, 'wb') as fh:
            cPickle.dump(counts, fh, -1)
    else:
        return counts