import os
from collections import defaultdict

import numpy as np
import pyinter
import pysam
import toolshed as ts


def read_exons(gtf):
    transcripts = defaultdict(pyinter.IntervalSet)
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf)
                 if x[0] != "#"):
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding":
            continue
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        transcript = toks[8].split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start - 1, end))
        names.append(toks[8].split('transcript_name "')[1]
                     .split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(toks[8].split('gene_id "')[1].split('"', 1)[0])
        trs.append(transcript)

    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    transcripts = dict((k, sorted(v)) for k, v in transcripts.items())

    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.items():
        sends = sorted(list(ivset))
        iset = pyinter.IntervalSet(
            pyinter.closedopen(x.lower_value, x.upper_value) for x in sends)
        lens = lens.union(iset)
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[tr] = (ss, es)
    totlen = sum(x.upper_value - x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen

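# A minimal sketch (not part of the original module) of why read_exons takes a
# union over `lens` before summing: pyinter merges overlapping intervals, so
# exon bases shared between transcripts are counted once in `totlen`.
def _example_exon_union():
    iv = pyinter.IntervalSet()
    iv.add(pyinter.closedopen(0, 100))
    iv.add(pyinter.closedopen(50, 150))  # overlaps the first exon
    # the two intervals merge into [0, 150), so the total is 150, not 250
    assert sum(x.upper_value - x.lower_value for x in iv) == 150
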
def load_genome_gaps(gapsfile, chrom_name):
    gaps = pyinter.IntervalSet()
    with open(gapsfile, 'r') as f:
        lines = [l for l in f.readlines() if l.split('\t')[0] == chrom_name]
    for line in lines:
        toks = line.split('\t')
        a, b = int(toks[1]), int(toks[2])
        gaps.add(pyinter.closedopen(a, b))
    return gaps

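# A hypothetical usage sketch for load_genome_gaps (the file name and contents
# are invented): the gaps file is tab-separated with chromosome, start, and
# end in the first three columns, one gap per line.
def _example_load_genome_gaps(tmpdir='/tmp'):
    path = os.path.join(tmpdir, 'gaps_example.txt')
    with open(path, 'w') as f:
        f.write('1\t1000\t2000\n')
        f.write('2\t500\t600\n')
    gaps = load_genome_gaps(path, '1')  # only chromosome "1" rows are kept
    assert sum(g.upper_value - g.lower_value for g in gaps) == 1000
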
def test_get_insertion_overlap_positions():
    blocks = [GenomeInterval(1, 0, 100),        # nodes 0, 1
              GenomeInterval(1, 100, 200),      # nodes 2, 3
              GenomeInterval(1, 210, 300),      # nodes 4, 5
              GenomeInterval(1, 350, 360),      # nodes 6, 7
              GenomeInterval(1, 370, 400),      # nodes 8, 9
              GenomeInterval(1, 0, 100, True),  # nodes 10, 11
              GenomeInterval(1, 0, 10, True)]   # nodes 12, 13
    paths = (list(range(10)),
             [0, 1, 10, 11, 2, 3],
             [0, 1, 2, 3, 10, 11, 2, 3],
             [0, 1, 2, 3, 12, 13, 2, 3],
             [0, 1, 2, 3, 4, 5, 10, 11, 6, 7],
             [0, 1, 2, 3, 4, 5, 12, 13, 6, 7])
    truth = [tuple(),
             ((80, 170),),
             ((185, 275),),
             tuple(),
             ((305, 395),),
             tuple()]
    rlen = 50
    m = 20
    for i in range(len(truth)):
        out, _, _ = get_insertion_overlap_positions(paths[i], blocks, rlen, m)
        inter = pyinter.IntervalSet()
        for interval in truth[i]:
            inter.add(pyinter.open(interval[0], interval[1]))
        print('truth: {0}\nresult: {1}\n'.format(inter, out))
        assert (out == inter)

    blocks = [GenomeInterval(1, 0, 100),
              GenomeInterval(1, 200, 300),
              GenomeInterval(0, 350, 400),
              GenomeInterval(1, 0, 50, True),
              GenomeInterval(1, 0, 50, True)]
    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    truth = [(130, 170), (355, 395)]
    out, _, _ = get_insertion_overlap_positions(path, blocks, rlen, m)
    inter = pyinter.IntervalSet()
    for interval in truth:
        inter.add(pyinter.open(interval[0], interval[1]))
    print('truth: {0}\nresult: {1}\n'.format(inter, out))
    assert (out == inter)

def get_insertion_overlap_positions(path, blocks, read_len, min_mappable=20):
    invalid_read_start_d = pyinter.IntervalSet()
    invalid_read_start_t = pyinter.IntervalSet()
    invalid_window_start = pyinter.IntervalSet()
    m = min_mappable
    R = read_len
    pos = 0
    blocks_gaps = genome_blocks_gaps(blocks, path)
    for b in blocks_gaps:
        if b.is_de_novo and len(b) - R + 2 * m > 0:
            invalid_read_start_d.add(
                pyinter.open(pos - m, pos + len(b) - R + m))
        elif b.is_translocation and len(b) - R + 2 * m > 0:
            invalid_read_start_t.add(
                pyinter.open(pos - m, pos + len(b) - R + m))
        if b.is_insertion():
            invalid_window_start.add(pyinter.open(pos - m, pos + len(b)))
        pos += len(b)
    # going through invalid_window_start first lets pyinter merge overlapping
    # windows; each merged interval is then trimmed by (R - m)
    invalid_read_start = pyinter.IntervalSet()
    for interval in invalid_window_start:
        if interval.lower_value < interval.upper_value - (R - m):
            invalid_read_start.add(
                pyinter.open(interval.lower_value,
                             interval.upper_value - (R - m)))
    overlapping_t, overlapping_d = [], []
    for interval in invalid_read_start:
        overlapping_d.append(
            any(d.overlaps(interval) for d in invalid_read_start_d))
        overlapping_t.append(
            any(t.overlaps(interval) for t in invalid_read_start_t))
    return invalid_read_start, overlapping_d, overlapping_t

def get_gap_overlap_positions(path, blocks, read_len, min_mappable=20):
    blocks_gaps = genome_blocks_gaps(blocks, path)
    m = min_mappable
    gap_ref = pyinter.IntervalSet()
    ref = pyinter.IntervalSet()
    pos = 0
    for b in blocks_gaps:
        if len(b) == 0:
            continue
        if not b.is_insertion():
            gap_ref.add(pyinter.closedopen(pos, pos + len(b)))
        if not b.is_gap:
            ref.add(pyinter.closedopen(pos, pos + len(b)))
        pos += len(b)

    A1 = pyinter.IntervalSet()  # i: [i, i+m) contained in gap_ref
    A2 = pyinter.IntervalSet()  # i: [i, i+m) overlaps ref
    for iv in gap_ref:
        if iv.lower_value <= iv.upper_value - m:
            A1.add(pyinter.closed(iv.lower_value, iv.upper_value - m))
    for iv in ref:
        A2.add(pyinter.closed(iv.lower_value - m + 1, iv.upper_value - 1))
    A3 = A1.intersection(A2)
    A4 = pyinter.IntervalSet()
    A5 = pyinter.IntervalSet()
    for iv in A1:
        A4.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    for iv in A3:
        A5.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    result = A4.difference(A5)

    # convert any closed endpoints to open ones and drop empty intervals
    out = pyinter.IntervalSet()
    for iv in result:
        a = iv.lower_value - 1 if iv.lower_value in iv else iv.lower_value
        b = iv.upper_value + 1 if iv.upper_value in iv else iv.upper_value
        if a < b - 1:
            out.add(pyinter.open(a, b))
    return out

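# A toy sketch (illustrative only, not from the original module) of the
# interval arithmetic above: A1 shrinks each interval's upper end by m so
# that it holds exactly the start positions i whose m-mer [i, i+m) fits
# inside the interval.
def _example_interval_arithmetic():
    m = 20
    gap_ref = pyinter.IntervalSet([pyinter.closedopen(0, 100)])
    A1 = pyinter.IntervalSet()
    for iv in gap_ref:
        if iv.lower_value <= iv.upper_value - m:
            A1.add(pyinter.closed(iv.lower_value, iv.upper_value - m))
    # start positions 0 through 80 keep [i, i+20) within [0, 100)
    assert A1 == pyinter.IntervalSet([pyinter.closed(0, 80)])
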
def compute_null_dist(opts, discordant_pairs, dtype, insert_mu, insert_sigma,
                      gap_file, lib_idx, lr_cond):
    nreps = opts['pecluster_null_reps']
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    gaps_inter = load_genome_gaps(gap_file, chrom_name)
    chrom_inter = pyinter.IntervalSet()
    chrom_inter.add(pyinter.closedopen(start, end))
    non_gaps_inter = chrom_inter.difference(gaps_inter)
    non_gaps = [(i.lower_value, i.upper_value) for i in non_gaps_inter]
    total_len = sum(i[1] - i[0] for i in non_gaps)

    # For deletion null clusters, don't use pairs that are obviously too
    # large. (For normal data the discordant read cutoff for deletion support
    # is roughly mu + 3*sigma ~ mu + 0.3*mu, and here we exclude anything
    # bigger than insert_max_mu_multiple * mu.)
    if dtype == 'Del':
        max_null_insert = insert_mu * opts['insert_max_mu_multiple']
    else:
        max_null_insert = np.inf

    null_clusters = []
    lr_null_clusters = np.array([], float)
    for _ in range(nreps):
        shuffled = shuffle_discordant_pairs(discordant_pairs, total_len,
                                            max_insert_size=max_null_insert)
        clusters_tmp, _ = cluster_pairs(opts, shuffled, dtype, lib_idx,
                                        insert_mu, insert_sigma)
        null_clusters.extend(clusters_tmp)
        lr_tmp = np.fromiter((lr_fun[dtype](c, insert_mu, insert_sigma,
                                            opts['insert_cutoff'], lr_cond)
                              for c in clusters_tmp), float)
        lr_null_clusters = np.append(lr_null_clusters, lr_tmp)
    if opts['verbosity'] > 1:
        print('[compute_null_dist] {0}'.format(dtype))
        print('shuffled lr:')
        print(lr_null_clusters)
        print('')
    outname = '{0}_{1}_null_cluster_{2}reps.txt'.format(
        opts['library_names'][lib_idx], dtype, nreps)
    fname = os.path.join(opts['outdir'], 'logging', outname)
    write_clustering_results(fname,
                             list(zip(lr_null_clusters, null_clusters)),
                             first_reject=0)
    lr_null_clusters.sort()
    return lr_null_clusters

def test_get_gap_overlap_positions():
    rlen = 50
    blocks = [GenomeInterval(1, 0, 100),
              GenomeInterval(1, 100, 200),
              GenomeInterval(1, 249, 300),
              GenomeInterval(1, 350, 400),
              GenomeInterval(1, 500, 600)]
    paths = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
             [0, 1, 2, 3, 4, 5, 7, 6, 8, 9])
    truth = ([(299, 301), (399, 451)],
             [(299, 326), (424, 451)])
    for i in range(len(truth)):
        out = get_gap_overlap_positions(paths[i], blocks, rlen)
        inter = pyinter.IntervalSet()
        for interval in truth[i]:
            inter.add(pyinter.open(interval[0], interval[1]))
        print('truth: {0}\nresult: {1}\n'.format(inter, out))
        assert (out == inter)

    blocks = [GenomeInterval(1, 0, 100),
              GenomeInterval(1, 200, 300),
              GenomeInterval(0, 350, 400),
              GenomeInterval(1, 0, 50, True),
              GenomeInterval(1, 0, 50, True)]
    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    truth = [(99, 131), (169, 201), (349, 356), (394, 401)]
    out = get_gap_overlap_positions(path, blocks, rlen)
    inter = pyinter.IntervalSet()
    for interval in truth:
        inter.add(pyinter.open(interval[0], interval[1]))
    print('truth: {0}\nresult: {1}\n'.format(inter, out))
    assert (out == inter)

def test_persons_availability(self):
    avail = self.p.get_availability(
        self.range_start, self.range_finish)  # type: inter.IntervalSet
    expected = inter.IntervalSet([
        inter.closed(
            1491354000, 1491368400),  # Wed, 05 Apr 2017 01:00:00 to 05:00:00 GMT
        # inter.closed(1491958800, 1491973200),  # Wed, 12 Apr 2017 01:00:00 to 05:00:00 GMT -- second Tuesday!
        inter.closed(
            1492563600, 1492578000),  # Wed, 19 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1493168400, 1493182800),  # Wed, 26 Apr 2017 01:00:00 to 05:00:00 GMT
    ])
    self.assertEqual(avail, expected)

def test_interval_set(self):
    iset = self.tp.as_interval_set(self.range_start, self.range_finish)
    expected = inter.IntervalSet([
        inter.closed(
            1491354000, 1491368400),  # Wed, 05 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1491958800, 1491973200),  # Wed, 12 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1492563600, 1492578000),  # Wed, 19 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1493168400, 1493182800),  # Wed, 26 Apr 2017 01:00:00 to 05:00:00 GMT
    ])
    self.assertEqual(iset, expected)

def constraints_unknown_sigma(support_directions,
                              RHS_offsets,
                              LHS_offsets,
                              observed_data,
                              direction_of_interest,
                              RSS,
                              RSS_df,
                              value_under_null=0.,
                              tol=1.e-4,
                              DEBUG=False):
    r"""
    Given a quasi-affine constraint $\{z: Az + u \leq \hat{\sigma} b\}$
    (elementwise), specified with $A$ as `support_directions`, $b$ as
    `RHS_offsets`, and $u$ as `LHS_offsets`, a new direction of interest
    $\eta$, and `observed_data`, a Gaussian vector $Z \sim N(\mu, \sigma^2 I)$
    with $\sigma$ unknown, this function returns $\eta^T Z$ as well as a set
    bounding this value. The value of $\hat{\sigma}$ is taken to be
    sqrt(RSS/RSS_df).

    The interval constructed is such that the endpoints are independent of
    $\eta^T Z$, hence the selective $T$ distribution of `sample carving`_
    can be used to form an exact pivot.

    To construct the interval, we are in effect conditioning on all
    randomness perpendicular to the direction of interest, i.e.
    $P_{\eta}^{\perp} X$ where $X$ is the Gaussian data vector.

    Notes
    -----
    Covariance is assumed to be an unknown multiple of the identity.

    Parameters
    ----------
    support_directions : np.float
        Matrix specifying the constraint, $A$.
    RHS_offsets : np.float
        Right-hand-side offset in the constraint, $b$.
    LHS_offsets : np.float
        Left-hand-side offset in the constraint, $u$.
    observed_data : np.float
        Observations.
    direction_of_interest : np.float
        Direction in which we're interested for the contrast.
    RSS : float
        Residual sum of squares.
    RSS_df : int
        Degrees of freedom of RSS.
    value_under_null : float
        Hypothesized value of $\eta^T \mu$ under the null.
    tol : float
        Relative tolerance parameter for deciding sign of $Az - b$.

    Returns
    -------
    truncation_set : pyinter.IntervalSet
        Intervals to which the observed statistic is truncated.
    Tobs : float
        Observed value of the $T$ statistic.
    """
    # shorthand
    A, b, L, X, w, theta = (support_directions,
                            RHS_offsets,
                            LHS_offsets,
                            observed_data,
                            direction_of_interest,
                            value_under_null)

    # make direction of interest a unit vector
    normw = np.linalg.norm(w)
    w = w / normw
    theta = theta / normw

    sigma_hat = np.sqrt(RSS / RSS_df)

    # compute the sufficient statistics
    U = (w * X).sum() - theta
    V = X - (X * w).sum() * w
    W = sigma_hat**2 * RSS_df + U**2
    Tobs = U / np.sqrt((W - U**2) / RSS_df)
    sqrtW = np.sqrt(W)

    alpha = np.dot(A, w)
    gamma = theta * alpha + np.dot(A, V) + L

    intervals = []
    for _a, _b, _c in zip(alpha, b, gamma):
        _a = _a * sqrtW
        _b = _b * sqrtW
        cur_intervals = sqrt_inequality_solver(_a, _c, _b, RSS_df)
        intervals.append(pyinter.IntervalSet(
            [pyinter.closed(*i) for i in cur_intervals if i]))

    truncation_set = intervals[0]
    for interv in intervals[1:]:
        truncation_set = truncation_set.intersection(interv)
    if not truncation_set:
        raise ValueError("empty truncation intervals")
    return truncation_set, Tobs

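# A minimal sketch (not part of the original code) of the truncation-set
# construction above: each constraint row contributes an IntervalSet, and
# the feasible region for the statistic is their intersection.
def _example_truncation_intersection():
    s1 = pyinter.IntervalSet([pyinter.closed(0.0, 5.0)])
    s2 = pyinter.IntervalSet([pyinter.closed(3.0, 8.0)])
    both = s1.intersection(s2)
    # the joint feasible set is [3, 5]
    assert both == pyinter.IntervalSet([pyinter.closed(3.0, 5.0)])
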
def svelter_convert(svelterfile, outdir, reffile, filter_gaps=False,
                    refgapfile=None, flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)
    log = open(os.path.join(outdir, 'convert_{0}.log'.format(svelterfile)), 'w')
    data = []
    # some SVs can be repeated in svelter output with different scores
    seen_svstring = set()
    seen_id = {}
    skipped_seen = 0
    skipped_refgap = 0
    with open(svelterfile, 'r') as svelter:
        toks_list = [line.rstrip().split('\t') for line in svelter]
    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom)
                      for chrom in chroms}
    else:
        chrom_gaps = None
    for toks in toks_list:
        # skip header
        if toks[0] == 'chr' and toks[1] == 'start':
            continue
        # skip non-passing scores
        if float(toks[6]) == 0:
            continue
        # skip duplicate SVs
        svstring = ' '.join(toks[:6])
        if svstring in seen_svstring:
            skipped_seen += 1
            continue
        seen_svstring.add(svstring)
        # adjust id if we've seen it before
        id = toks[3]
        num_id_seen = seen_id.get(id, 0)
        seen_id[id] = num_id_seen + 1
        if num_id_seen > 0:
            print('saw {0} again'.format(id))
            id_extra = ';' + str(num_id_seen + 1)
        else:
            id_extra = ''
        chrom = toks[0]
        bp_str = toks[3].split(':')[1:]
        bp = [int(x) for x in bp_str]
        if filter_gaps:
            sv_interval = pyinter.closedopen(bp[0], bp[-1])
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue
        breakpoints = {(x, x): Breakpoint((x, x)) for x in bp}
        slop_left, slop_right = flank_size, flank_size
        start = bp[0] - slop_left
        end = bp[-1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(),
                              chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        svelter_strings = toks[5].split('/')
        paths = [svelter_string_to_path(x, len(blocks))
                 for x in svelter_strings]
        score = float(toks[6])
        # no extra INFO/FORMAT tags as in the VCF case
        this_data = (paths, blocks, left_bp, right_bp, score, 'PASS',
                     id_extra, None, None)
        data.append(this_data)
    log.write('skipped_seen\t{0}\n'.format(skipped_seen))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity)
    log.close()

def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False,
                        refgapfile=None, caller=None, flank_size=1000,
                        verbosity=0):
    os.system('mkdir -p %s' % outdir)
    log = open(os.path.join(outdir, 'convert_{0}.log'.format(vcffile)), 'w')
    data = []
    svtype_skipped = {}
    seen_coords_count = {}
    skipped_refgap = 0
    write_extra = False  # need to write FORMAT or INFO to file?
    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t') for line in vcf
                     if line[0] != '#']
    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom)
                      for chrom in chroms}
    else:
        chrom_gaps = None
    for toks in toks_list:
        # NOTE not parsing qual; do filtering beforehand for DELLY
        (chrom, pos, id, ref, alt, qual, filterstring, info,
         format, sample1) = toks
        # VCF is 1-indexed, but specifies pos/end positions which are to the
        # left of breakpoints, so no adjustment
        pos = int(pos)
        tags = info.split(';')
        if 'PRECISE' in tags:
            filterstring += ':PRECISE'
        elif 'IMPRECISE' in tags:
            filterstring += ':IMPRECISE'
        elif caller == 'lumpy':
            # lumpy only includes tags for imprecise events
            filterstring += ':PRECISE'
        tags = [t for t in tags if '=' in t]
        tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
        end = int(tagd.get('END', -99999))
        svtype = tagd['SVTYPE']
        if caller == 'pindel' and svtype == 'INS':
            inslen = int(tagd['SVLEN'])
        else:
            inslen = int(tagd.get('INSLEN', 0))
        if caller == 'pindel':
            homlen = int(tagd['HOMLEN'])
            if pos + homlen > end or svtype == 'INS':
                print('pos + homlen > end: positions {0}'.format((pos, end)))
                cipos = (0, 0)
                ciend = (0, 0)
            else:
                cipos = (0, homlen)
                ciend = (0, homlen)
        else:
            if 'CIPOS95' in tagd:  # LUMPY
                tmp = tagd['CIPOS95'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            elif 'CIPOS' in tagd:
                tmp = tagd['CIPOS'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            else:
                cipos = (0, 0)
            if 'CIEND95' in tagd:  # LUMPY
                tmp = tagd['CIEND95'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            elif 'CIEND' in tagd:
                tmp = tagd['CIEND'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            else:
                ciend = (0, 0)
        split_support = int(tagd.get('SR', 0))
        pe_support = int(tagd.get('PE', 0))
        # lumpy STRANDS only relevant for inversions
        if caller == 'lumpy' and svtype == 'INV':
            tmp = tagd['STRANDS'].split(',')
            tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
            tagd['INV_PLUS'] = tmpd['++']
            tagd['INV_MINUS'] = tmpd['--']
        tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                     'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
        tagd_extra = {k: v for (k, v) in tagd.items() if k not in tagd_used}
        tags2 = {k: v for (k, v) in zip(format.split(':'),
                                        sample1.split(':'))}
        if 'AD' in tags2:  # pindel
            split_support = int(tags2['AD'].split(',')[1])
        gt = tags2['GT']
        if gt == './.' or gt == '.|.':
            is_het = False
            filterstring += ':NOGT'
        elif gt in ('0/0', '0|0'):
            is_het = False
            filterstring += ':ZEROGT'
        elif gt in ('0/1', '1/0', '0|1', '1|0'):
            is_het = True
        else:
            assert gt in ('1/1', '1|1')
            is_het = False
        tags2_used = ('AD', 'SR', 'PE', 'SU')
        tags2_extra = {k: v for (k, v) in tags2.items()
                       if k not in tags2_used}
        if len(tagd_extra) + len(tags2_extra) > 0:
            write_extra = True
        # cases
        if svtype == 'DEL':
            path = (0, 1, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Del'
        elif svtype == 'INV':
            path = (0, 1, 3, 2, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'InvL'
        elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
            path = (0, 1, 2, 3, 2, 3, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Dup'
        elif svtype == 'INS':
            # INSERTIONS: parse inslen, add insertion block to blocks
            path = (0, 1, 4, 5, 2, 3)
            refpath = (0, 1, 2, 3)
            supptype = 'Ins'
        else:
            # skipping delly TRA; skipping BND events as they may be
            # ambiguous in terms of the path
            svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
            continue
        # check ref gap overlap
        if filter_gaps and end > pos:
            sv_interval = pyinter.closedopen(pos, end)
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue
        # create breakpoints and blocks, keeping in mind uncertainty and
        # possible insertion
        if caller == 'lumpy' and svtype != 'INS':
            # lumpy intervals are not symmetric. POS and END are each the
            # "best guess" for the breakpoints
            bp = [(pos, pos), (end, end)]
        elif svtype != 'INS':
            if pos + cipos[1] < end + ciend[0]:
                bp = [(pos + cipos[0], pos + cipos[1]),
                      (end + ciend[0], end + ciend[1])]
            else:
                bp = [(pos, pos), (end, end)]
                filterstring += ':BPOVERLAP'
        else:
            if cipos[1] > cipos[0]:
                bp = [(pos + cipos[0], pos + cipos[1])]
            else:
                bp = [(pos, pos)]
        pe = [(x, supptype) for x in range(pe_support)]
        # TODO SupportingSplit
        splits = []
        for i in range(split_support):
            aln_tmp = pysam.AlignedSegment()
            aln_tmp.qname = str(i)  # pysam expects a string query name
            aln_tmp.is_read1 = True
            split_type = supptype + '+'
            splits.append(SupportingSplit(aln_tmp, None, None, None, None,
                                          split_type))
        breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
        slop_left, slop_right = flank_size, flank_size
        start = bp[0][0] - slop_left
        end = bp[-1][1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(),
                              chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        if svtype == 'INS':
            blocks.append(GenomeInterval(chrom, 0, inslen, is_de_novo=True))
        paths = [path, refpath] if is_het else [path, path]
        score = 0
        coords = (start, end)
        scc = seen_coords_count.get(coords, 0)
        id_extra = chr(ord('a') + scc) if scc > 0 else ''
        seen_coords_count[coords] = scc + 1
        this_data = (paths, blocks, left_bp, right_bp, score, filterstring,
                     id_extra, tagd_extra, tags2_extra)
        data.append(this_data)
    for svtype, count in svtype_skipped.items():
        log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)
    log.close()

def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    # create list of blocks between breakpoints, adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []
    breakpoints[(end, end)] = Breakpoint((end, end))
    bploc = list(breakpoints.keys())
    bploc.sort()
    last_end = start
    last_breakpoint = Breakpoint((start, start))
    for bpl in bploc:
        breakpoint = breakpoints[bpl]
        if bpl[0] <= start or bpl[1] > end:
            continue
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])
        iset.add(blockinterval)
        adjusted_blocks = sorted(list(iset.difference(gaps)))
        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))
        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:
                # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(GenomeInterval(chrom_name, ab.lower_value,
                                             ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(gap_indices)
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints

def get_time_intervals(time_points):
    inter_lst = []
    for start, end in pairwise(time_points):
        inter_lst.append(pyinter.closed(start, end))
    return pyinter.IntervalSet(inter_lst)

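# A usage sketch (not from the original module); `pairwise` is assumed to
# behave like itertools.pairwise, yielding consecutive (start, end) pairs.
def _example_get_time_intervals():
    iset = get_time_intervals([0, 10, 20])
    # consecutive points become the closed intervals [0, 10] and [10, 20];
    # however pyinter represents them, the covered length is 20
    assert sum(x.upper_value - x.lower_value for x in iset) == 20
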
def test_add_variants_to_set_from_bed(self):
    common_entities = create_common_entities()
    project = common_entities['project']
    self.ref_genome_1 = common_entities['reference_genome']

    alignment_group = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=self.ref_genome_1,
        aligner=AlignmentGroup.ALIGNER.BWA)

    (self.sample_1, created) = ExperimentSample.objects.get_or_create(
        project=project, label=SAMPLE_1_LABEL)

    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group,
        experiment_sample=self.sample_1)

    # Create variants in the bed regions from best_test.bed
    for var_poor_map in range(20):
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(101, 200),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})

    for var_no_cov in range(20):
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(301, 400),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})

        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(501, 600),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})

    new_bed_path = copy_dataset_to_entity_data_dir(
        entity=sample_alignment,
        original_source_location=TEST_BED)

    bed_dataset = add_dataset_to_entity(
        sample_alignment,
        dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
        dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
        filesystem_location=new_bed_path)

    vs_to_v_map = add_variants_to_set_from_bed(sample_alignment, bed_dataset)

    variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
    self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                     variant_set_labels)

    for variant_set, variants in vs_to_v_map.items():
        for v in variants:
            # POOR_MAPPING_QUALITY should be from 101 to 200
            if variant_set.label == 'POOR_MAPPING_QUALITY':
                self.assertTrue(v.position in pyinter.closedopen(101, 200))
            # NO_COVERAGE should be from 301 to 400 or 501 to 600
            elif variant_set.label == 'NO_COVERAGE':
                self.assertTrue(v.position in pyinter.IntervalSet([
                    pyinter.closedopen(301, 400),
                    pyinter.closedopen(501, 600)]))
            else:
                raise AssertionError(
                    'bad variant set %s made.' % variant_set.label)

def len_without_gaps(chrom_name, start, end, gapsfile):
    gaps = load_genome_gaps(gapsfile, chrom_name)
    region = pyinter.IntervalSet()
    region.add(pyinter.closedopen(start, end))
    diff = region.difference(gaps)
    return sum(x.upper_value - x.lower_value for x in diff)

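# A quick sanity sketch for len_without_gaps (the file contents are invented):
# a 1000-base region containing a single 100-base gap leaves 900 bases.
def _example_len_without_gaps(tmpdir='/tmp'):
    path = os.path.join(tmpdir, 'gaps_example2.txt')
    with open(path, 'w') as f:
        f.write('1\t400\t500\n')
    assert len_without_gaps('1', 0, 1000, path) == 900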