def read_exons(gtf):
    transcripts = defaultdict(pyinter.IntervalSet)
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        # keep only protein-coding CDS and stop_codon features
        # (note the tuples: `in ("protein_coding")` would be a substring test)
        if toks[2] not in ("CDS", "stop_codon") or toks[1] not in ("protein_coding",):
            continue
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        transcript = toks[8].split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start - 1, end))
        names.append(toks[8].split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(toks[8].split('gene_id "')[1].split('"', 1)[0])
        trs.append(transcript)
    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    transcripts = dict((k, sorted(v)) for k, v in transcripts.items())
    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.items():
        sends = sorted(list(ivset))
        iset = pyinter.IntervalSet(pyinter.closedopen(x.lower_value, x.upper_value)
                                   for x in sends)
        lens = lens.union(iset)
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[tr] = (ss, es)
    totlen = sum(x.upper_value - x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen
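# A minimal sketch (with a made-up attribute string, not from any real GTF) of
# the field-9 parsing used in read_exons above: attributes are pulled out by
# splitting on the 'key "' prefix rather than with a full GTF parser.
def _demo_gtf_attr_parsing():
    attrs = ('gene_id "ENSG01"; transcript_id "ENST01"; '
             'transcript_name "GENE1-001";')
    gene = attrs.split('gene_id "')[1].split('"', 1)[0]
    transcript = attrs.split('transcript_id "')[1].split('"', 1)[0]
    name = attrs.split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0]
    assert (gene, transcript, name) == ('ENSG01', 'ENST01', 'GENE1')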
def get_gap_overlap_positions(path, blocks, read_len, min_mappable=20):
    blocks_gaps = genome_blocks_gaps(blocks, path)
    m = min_mappable
    gap_ref = pyinter.IntervalSet()
    ref = pyinter.IntervalSet()
    pos = 0
    for b in blocks_gaps:
        if len(b) == 0:
            continue
        if not b.is_insertion():
            gap_ref.add(pyinter.closedopen(pos, pos + len(b)))
        if not b.is_gap:
            ref.add(pyinter.closedopen(pos, pos + len(b)))
        pos += len(b)
    A1 = pyinter.IntervalSet()  # i: [i, i+m) contained in gap_ref
    A2 = pyinter.IntervalSet()  # i: [i, i+m) overlaps ref
    for iv in gap_ref:
        if iv.lower_value <= iv.upper_value - m:
            A1.add(pyinter.closed(iv.lower_value, iv.upper_value - m))
    for iv in ref:
        A2.add(pyinter.closed(iv.lower_value - m + 1, iv.upper_value - 1))
    A3 = A1.intersection(A2)
    A4 = pyinter.IntervalSet()
    A5 = pyinter.IntervalSet()
    for iv in A1:
        A4.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    for iv in A3:
        A5.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    result = A4.difference(A5)
    # remove any empty intervals, converting everything to open intervals
    out = pyinter.IntervalSet()
    for iv in result:
        a = iv.lower_value - 1 if iv.lower_value in iv else iv.lower_value
        b = iv.upper_value + 1 if iv.upper_value in iv else iv.upper_value
        if a < b - 1:
            out.add(pyinter.open(a, b))
    return out
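# A small worked example (hypothetical coordinates) of the pyinter set algebra
# that get_gap_overlap_positions relies on: intervals collect into IntervalSets,
# which are then combined with intersection() and difference(); each resulting
# interval exposes its endpoints as lower_value/upper_value.
def _demo_interval_algebra():
    a = pyinter.IntervalSet()
    a.add(pyinter.closed(0, 10))
    b = pyinter.IntervalSet()
    b.add(pyinter.closed(5, 20))
    both = a.intersection(b)    # positions covered by both: [5, 10]
    only_a = a.difference(b)    # positions covered only by a: [0, 5)
    assert 7 in next(iter(both))
    assert 3 in next(iter(only_a))
    assert 7 not in next(iter(only_a))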
def generate_gbk_feature_index(genbank_path, feature_index_output_path):
    """Create a pickled pyinter index of genbank features so we can pull
    them quickly.
    """
    gbk_feature_list = []
    with open(genbank_path, 'r') as fh:
        for seq_record in SeqIO.parse(fh, 'genbank'):
            for f in seq_record.features:
                if f.type not in GBK_FEATURES_TO_EXTRACT:
                    continue
                f_ivl = pyinter.closedopen(f.location.start, f.location.end)
                f_ivl.type = f.type
                if 'gene' in f.qualifiers:
                    f_ivl.name = f.qualifiers['gene'][0]
                elif 'mobile_element_type' in f.qualifiers:
                    f_ivl.name = f.qualifiers['mobile_element_type'][0]
                # For now, features with neither qualifier are kept unnamed.
                gbk_feature_list.append(f_ivl)
    # pickle needs a binary-mode file handle
    with open(feature_index_output_path, 'wb') as fh:
        pickle.dump(gbk_feature_list, fh)
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """Use the genbank feature index dataset to return the features (gene or
    mobile element intervals) that overlap the given intervals.
    """
    feature_index_path = get_dataset_with_type(
        ref_genome, Dataset.TYPE.FEATURE_INDEX).get_absolute_location()
    # pickle needs a binary-mode file handle
    with open(feature_index_path, 'rb') as fh:
        gbk_feature_list = pickle.load(fh)
    # Dictionary of overlapping features, keyed by input interval.
    return_features = {}
    for interval in intervals:
        q_ivl = pyinter.closedopen(*interval)
        features = [f_ivl for f_ivl in gbk_feature_list
                    if q_ivl.intersect(f_ivl)]
        return_features[interval] = features
    return return_features
def load_genome_gaps(gapsfile, chrom_name):
    gaps = pyinter.IntervalSet()
    with open(gapsfile, 'r') as fh:
        lines = [l for l in fh.readlines() if l.split('\t')[0] == chrom_name]
        for line in lines:
            toks = line.split('\t')
            a, b = int(toks[1]), int(toks[2])
            gaps.add(pyinter.closedopen(a, b))
    return gaps
def read_repeats(path, keyname):
    tracks = defaultdict(pyinter.IntervalSet)
    for toks in (x.rstrip('\r\n').split() for x in ts.nopen(path) if x[0] != "#"):
        start, end = map(int, toks[1:3])
        assert start <= end, toks
        tracks[keyname].add(pyinter.closedopen(start, end))
    ints = {}
    for pid, ivset in tracks.items():
        sends = sorted(list(ivset))
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[pid] = (ss, es)
    return ints
def compute_null_dist(opts, discordant_pairs, dtype, insert_mu, insert_sigma,
                      gap_file, lib_idx, lr_cond):
    nreps = opts['pecluster_null_reps']
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    gaps_inter = load_genome_gaps(gap_file, chrom_name)
    chrom_inter = pyinter.IntervalSet()
    chrom_inter.add(pyinter.closedopen(start, end))
    non_gaps_inter = chrom_inter.difference(gaps_inter)
    non_gaps = [(i.lower_value, i.upper_value) for i in non_gaps_inter]
    total_len = sum([i[1] - i[0] for i in non_gaps])
    # For deletion null clusters, don't use pairs that are obviously too large.
    # (For normal data the discordant read cutoff for deletion supports is
    # about mu + 3 sigma ~ mu + .3 mu, and we're excluding stuff bigger than 3 mu.)
    if dtype == 'Del':
        max_null_insert = insert_mu * opts['insert_max_mu_multiple']
    else:
        max_null_insert = np.inf
    null_clusters = []
    lr_null_clusters = np.array([], float)
    for _ in range(nreps):
        shuffled = shuffle_discordant_pairs(discordant_pairs, total_len,
                                            max_insert_size=max_null_insert)
        clusters_tmp, _ = cluster_pairs(opts, shuffled, dtype, lib_idx,
                                        insert_mu, insert_sigma)
        null_clusters.extend(clusters_tmp)
        lr_tmp = np.fromiter((lr_fun[dtype](c, insert_mu, insert_sigma,
                                            opts['insert_cutoff'], lr_cond)
                              for c in clusters_tmp), float)
        lr_null_clusters = np.append(lr_null_clusters, lr_tmp)
    if opts['verbosity'] > 1:
        print('[compute_null_dist] {0}'.format(dtype))
        print('shuffled lr:')
        print(lr_null_clusters)
        print('')
    outname = ('{0}_{1}_null_cluster_{2}reps.txt'
               .format(opts['library_names'][lib_idx], dtype, nreps))
    fname = os.path.join(opts['outdir'], 'logging', outname)
    write_clustering_results(fname,
                             list(zip(lr_null_clusters, null_clusters)),
                             first_reject=0)
    lr_null_clusters.sort()
    return lr_null_clusters
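# A minimal sketch (hypothetical helper, not part of the source) of how the
# sorted null distribution returned by compute_null_dist might be used: take
# the empirical (1 - alpha) quantile of the shuffled likelihood ratios as a
# significance cutoff for observed clusters.
def null_lr_cutoff(lr_null_clusters, alpha=0.05):
    # lr_null_clusters is assumed sorted ascending, as returned above
    idx = int(np.ceil((1 - alpha) * len(lr_null_clusters))) - 1
    idx = min(max(idx, 0), len(lr_null_clusters) - 1)
    return lr_null_clusters[idx]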
def read_pfam(path):
    tracks = defaultdict(pyinter.IntervalSet)
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split() for x in ts.nopen(path) if x[0] != "#"):
        start, end = map(int, toks[1:3])
        assert start <= end, toks
        pid = toks[10].split(';', 1)[0].strip('"')  # pfamA_id
        tracks[pid].add(pyinter.closedopen(start, end))
        ids.append(toks[12].split(';', 1)[0].strip('"'))  # gene_name
        trs.append(toks[14].split(';', 1)[0].strip('"'))  # transcript_id
    ints = {}
    for pid, ivset in tracks.items():
        sends = sorted(list(ivset))
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[pid] = (ss, es)
    return ints
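# A minimal sketch (hypothetical helper, not in the source) of why read_pfam,
# read_repeats, and read_exons flatten each IntervalSet into parallel sorted
# start/end lists: with starts sorted and the intervals disjoint (an
# IntervalSet keeps its intervals merged), bisect gives a fast point-overlap
# test.
from bisect import bisect_right

def overlaps_any(ints, key, pos):
    ss, es = ints[key]
    i = bisect_right(ss, pos) - 1  # rightmost interval starting at or before pos
    return i >= 0 and pos < es[i]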
def svelter_convert(svelterfile, outdir, reffile, filter_gaps=False,
                    refgapfile=None, flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)
    log = open(os.path.join(outdir, 'convert_{0}.log'.format(svelterfile)), 'w')
    data = []
    # some SVs can be repeated in svelter output with different scores
    seen_svstring = set()
    seen_id = {}
    skipped_seen = 0
    skipped_refgap = 0
    with open(svelterfile, 'r') as svelter:
        toks_list = [line.rstrip().split('\t') for line in svelter]
    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom)
                      for chrom in chroms}
    else:
        chrom_gaps = None
    for toks in toks_list:
        # skip header
        if toks[0] == 'chr' and toks[1] == 'start':
            continue
        # skip non-passing scores
        if float(toks[6]) == 0:
            continue
        # skip duplicate SVs
        svstring = ' '.join(toks[:6])
        if svstring in seen_svstring:
            skipped_seen += 1
            continue
        seen_svstring.add(svstring)
        # adjust id if we've seen it before
        id = toks[3]
        num_id_seen = seen_id.get(id, 0)
        seen_id[id] = num_id_seen + 1
        if num_id_seen > 0:
            print('saw {0} again'.format(id))
            id_extra = ';' + str(num_id_seen + 1)
        else:
            id_extra = ''
        chrom = toks[0]
        bp_str = toks[3].split(':')[1:]
        bp = [int(x) for x in bp_str]
        if filter_gaps:
            sv_interval = pyinter.closedopen(bp[0], bp[-1])
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue
        breakpoints = {(x, x): Breakpoint((x, x)) for x in bp}
        # flank the SV by a fixed amount of sequence on each side
        slop_left, slop_right = flank_size, flank_size
        start = bp[0] - slop_left
        end = bp[-1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(),
                              chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        svelter_strings = toks[5].split('/')
        paths = [svelter_string_to_path(x, len(blocks))
                 for x in svelter_strings]
        score = float(toks[6])
        # no extra INFO/FORMAT tags as in the VCF case
        this_data = (paths, blocks, left_bp, right_bp, score, 'PASS',
                     id_extra, None, None)
        data.append(this_data)
    log.write('skipped_seen\t{0}\n'.format(skipped_seen))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity)
    log.close()
def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False,
                        refgapfile=None, caller=None, flank_size=1000,
                        verbosity=0):
    os.system('mkdir -p %s' % outdir)
    log = open(os.path.join(outdir, 'convert_{0}.log'.format(vcffile)), 'w')
    data = []
    svtype_skipped = {}
    seen_coords_count = {}
    skipped_refgap = 0
    write_extra = False  # need to write FORMAT or INFO to file?
    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t') for line in vcf
                     if line[0] != '#']
    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom)
                      for chrom in chroms}
    else:
        chrom_gaps = None
    for toks in toks_list:
        # NOTE not parsing qual; do filtering beforehand for DELLY
        chrom, pos, id, ref, alt, qual, filterstring, info, format, sample1 = toks
        # VCF is 1-indexed, but specifies pos/end positions which are to the
        # left of breakpoints, so no adjustment
        pos = int(pos)
        tags = info.split(';')
        if 'PRECISE' in tags:
            filterstring += ':PRECISE'
        elif 'IMPRECISE' in tags:
            filterstring += ':IMPRECISE'
        elif caller == 'lumpy':
            # lumpy only includes tags for imprecise events
            filterstring += ':PRECISE'
        tags = [t for t in tags if '=' in t]
        tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
        end = int(tagd.get('END', -99999))
        svtype = tagd['SVTYPE']
        if caller == 'pindel' and svtype == 'INS':
            inslen = int(tagd['SVLEN'])
        else:
            inslen = int(tagd.get('INSLEN', 0))
        if caller == 'pindel':
            homlen = int(tagd['HOMLEN'])
            if pos + homlen > end or svtype == 'INS':
                print('pos + homlen > end: positions {0}'.format((pos, end)))
                cipos = (0, 0)
                ciend = (0, 0)
            else:
                cipos = (0, homlen)
                ciend = (0, homlen)
        else:
            if 'CIPOS95' in tagd:  # LUMPY
                tmp = tagd['CIPOS95'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            elif 'CIPOS' in tagd:
                tmp = tagd['CIPOS'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            else:
                cipos = (0, 0)
            if 'CIEND95' in tagd:  # LUMPY
                tmp = tagd['CIEND95'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            elif 'CIEND' in tagd:
                tmp = tagd['CIEND'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            else:
                ciend = (0, 0)
        split_support = int(tagd.get('SR', 0))
        pe_support = int(tagd.get('PE', 0))
        # lumpy STRANDS only relevant for inversions
        if caller == 'lumpy' and svtype == 'INV':
            tmp = tagd['STRANDS'].split(',')
            tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
            tagd['INV_PLUS'] = tmpd['++']
            tagd['INV_MINUS'] = tmpd['--']
        tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                     'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
        tagd_extra = {k: v for (k, v) in tagd.items() if k not in tagd_used}
        tags2 = {k: v for (k, v) in zip(format.split(':'), sample1.split(':'))}
        if 'AD' in tags2:  # pindel
            split_support = int(tags2['AD'].split(',')[1])
        gt = tags2['GT']
        if gt == './.' or gt == '.|.':
            is_het = False
            filterstring += ':NOGT'
        elif gt in ('0/0', '0|0'):
            is_het = False
            filterstring += ':ZEROGT'
        elif gt in ('0/1', '1/0', '0|1', '1|0'):
            is_het = True
        else:
            assert gt in ('1/1', '1|1')
            is_het = False
        tags2_used = ('AD', 'SR', 'PE', 'SU')
        tags2_extra = {k: v for (k, v) in tags2.items() if k not in tags2_used}
        if len(tagd_extra) + len(tags2_extra) > 0:
            write_extra = True
        # cases
        if svtype == 'DEL':
            path = (0, 1, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Del'
        elif svtype == 'INV':
            path = (0, 1, 3, 2, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'InvL'
        elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
            path = (0, 1, 2, 3, 2, 3, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Dup'
        elif svtype == 'INS':
            # insertions: parse inslen, add insertion block to blocks below
            path = (0, 1, 4, 5, 2, 3)
            refpath = (0, 1, 2, 3)
            supptype = 'Ins'
        else:
            # skipping delly TRA, and BND events as they may be ambiguous
            # in terms of the path
            svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
            continue
        # check ref gap overlap
        if filter_gaps and end > pos:
            sv_interval = pyinter.closedopen(pos, end)
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue
        # create breakpoints and blocks, keeping in mind uncertainty and
        # possible insertion
        if caller == 'lumpy' and svtype != 'INS':
            # lumpy intervals are not symmetric: POS and END are each the
            # "best guess" for the breakpoints
            bp = [(pos, pos), (end, end)]
        elif svtype != 'INS':
            if pos + cipos[1] < end + ciend[0]:
                bp = [(pos + cipos[0], pos + cipos[1]),
                      (end + ciend[0], end + ciend[1])]
            else:
                bp = [(pos, pos), (end, end)]
                filterstring += ':BPOVERLAP'
        else:
            if cipos[1] > cipos[0]:
                bp = [(pos + cipos[0], pos + cipos[1])]
            else:
                bp = [(pos, pos)]
        pe = [(x, supptype) for x in range(pe_support)]
        # TODO SupportingSplit
        splits = []
        for i in range(split_support):
            aln_tmp = pysam.AlignedSegment()
            aln_tmp.qname = str(i)  # qname must be a string
            aln_tmp.is_read1 = True
            split_type = supptype + '+'
            splits.append(SupportingSplit(aln_tmp, None, None, None, None,
                                          split_type))
        breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
        slop_left, slop_right = flank_size, flank_size
        start = bp[0][0] - slop_left
        end = bp[-1][1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(),
                              chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        if svtype == 'INS':
            blocks.append(GenomeInterval(chrom, 0, inslen, is_de_novo=True))
        paths = [path, refpath] if is_het else [path, path]
        score = 0
        coords = (start, end)
        scc = seen_coords_count.get(coords, 0)
        id_extra = chr(ord('a') + scc) if scc > 0 else ''
        seen_coords_count[coords] = scc + 1
        this_data = (paths, blocks, left_bp, right_bp, score, filterstring,
                     id_extra, tagd_extra, tags2_extra)
        data.append(this_data)
    for svtype, count in svtype_skipped.items():
        log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)
    log.close()
def add_variants_to_set_from_bed(sample_alignment, bed_dataset):
    """Given a bed with feature names and a corresponding sample alignment,
    create a new variant set for every unique feature name and assign the
    variants that fall within each feature to its set.

    E.g. BED:
        ...
        NC_000913  223514  223534  POOR_MAPPING_QUALITY
        NC_000913  223542  223734  NO_COVERAGE
        NC_000913  223751  224756  POOR_MAPPING_QUALITY
        ...

    Add variants in 223542-223734 to NO_COVERAGE.
    Add variants in 223751-224756 and 223514-223534 to POOR_MAPPING_QUALITY.
    """
    # Read in the bed file
    bed_dataset_fn = bed_dataset.get_absolute_location()
    reference_genome = sample_alignment.alignment_group.reference_genome
    experiment_sample = sample_alignment.experiment_sample
    # 1. Create a dictionary of disjoint intervals (a recursive defaultdict:
    #    feature -> chromosome -> IntervalSet).
    feature_disj_intervals = defaultdict(
        lambda: defaultdict(pyinter.IntervalSet))
    variants_to_add = defaultdict(list)
    with open(bed_dataset_fn) as bed_dataset_fh:
        for i, line in enumerate(bed_dataset_fh):
            try:
                chrom, start, end, feature = line.strip().split('\t')
                # add the new interval [start, end) to the existing set
                new_ivl = pyinter.closedopen(int(start), int(end))
                feature_disj_intervals[feature][chrom].add(new_ivl)
            except ValueError:
                print('WARNING: Callable Loci line %d: (%s) couldn\'t be '
                      'parsed.' % (i, line))
    # 2. Associate variants with these intervals.
    variants = Variant.objects.filter(
        variantcallercommondata__alignment_group=sample_alignment.alignment_group)
    for v in variants:
        for feat, chrom_ivls in feature_disj_intervals.items():
            # Skip if there is no interval in this chromosome
            if v.chromosome.label not in chrom_ivls:
                continue
            if not chrom_ivls[v.chromosome.label]:
                continue
            if v.position in chrom_ivls[v.chromosome.label]:
                variants_to_add[feat].append(v)
    # 3. Make new variant sets for any features with variants,
    #    and add the variants to them.
    variant_set_to_variant_map = {}
    for feat, variants in variants_to_add.items():
        feat_variant_set, created = VariantSet.objects.get_or_create(
            reference_genome=reference_genome, label=feat)
        grouped_uid_dict_list = [
            {'sample_uid': experiment_sample.uid, 'variant_uid': v.uid}
            for v in variants]
        variant_uid_to_obj_map = dict((v.uid, v) for v in variants)
        sample_uid_to_obj_map = {experiment_sample.uid: experiment_sample}
        _perform_add(grouped_uid_dict_list, feat_variant_set,
                     variant_uid_to_obj_map, sample_uid_to_obj_map)
        variant_set_to_variant_map[feat_variant_set] = variants
    return variant_set_to_variant_map
def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    # create the list of blocks between breakpoints,
    # adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []
    breakpoints[(end, end)] = Breakpoint((end, end))
    bploc = list(breakpoints.keys())
    bploc.sort()
    last_end = start
    last_breakpoint = Breakpoint((start, start))
    for bpl in bploc:
        breakpoint = breakpoints[bpl]
        if bpl[0] <= start or bpl[1] > end:
            continue
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])
        iset.add(blockinterval)
        adjusted_blocks = iset.difference(gaps)
        adjusted_blocks = sorted(list(adjusted_blocks))
        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))
        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:
                # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(
                    GenomeInterval(chrom_name, ab.lower_value, ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(list(gap_indices))
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints
def test_add_variants_to_set_from_bed(self):
    common_entities = create_common_entities()
    project = common_entities['project']
    self.ref_genome_1 = common_entities['reference_genome']
    alignment_group = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=self.ref_genome_1,
        aligner=AlignmentGroup.ALIGNER.BWA)
    self.sample_1, created = ExperimentSample.objects.get_or_create(
        project=project, label=SAMPLE_1_LABEL)
    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group, experiment_sample=self.sample_1)
    # Create variants in the bed regions from best_test.bed
    for var_poor_map in range(20):
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(101, 200),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})
    for var_no_cov in range(20):
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(301, 400),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.ref_genome_1,
            chromosome=Chromosome.objects.get(
                reference_genome=self.ref_genome_1),
            position=random.randint(501, 600),
            ref_value='A')
        vccd = VariantCallerCommonData.objects.create(
            variant=variant,
            source_dataset_id=1,
            alignment_group=alignment_group,
            data={})
    new_bed_path = copy_dataset_to_entity_data_dir(
        entity=sample_alignment,
        original_source_location=TEST_BED)
    bed_dataset = add_dataset_to_entity(
        sample_alignment,
        dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
        dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
        filesystem_location=new_bed_path)
    vs_to_v_map = add_variants_to_set_from_bed(sample_alignment, bed_dataset)
    variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
    self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                     variant_set_labels)
    for variant_set, variants in vs_to_v_map.items():
        for v in variants:
            # POOR_MAPPING_QUALITY should be from 101 to 200
            if variant_set.label == 'POOR_MAPPING_QUALITY':
                self.assertTrue(v.position in pyinter.closedopen(101, 200))
            # NO_COVERAGE should be from 301 to 400 and 501 to 600
            elif variant_set.label == 'NO_COVERAGE':
                self.assertTrue(v.position in pyinter.IntervalSet([
                    pyinter.closedopen(301, 400),
                    pyinter.closedopen(501, 600)]))
            else:
                raise AssertionError(
                    'bad variant set %s made.' % variant_set.label)
def len_without_gaps(chrom_name, start, end, gapsfile):
    gaps = load_genome_gaps(gapsfile, chrom_name)
    region = pyinter.IntervalSet()
    region.add(pyinter.closedopen(start, end))
    diff = region.difference(gaps)
    return sum(x.upper_value - x.lower_value for x in diff)
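# A quick worked check (made-up coordinates) of the difference operation used
# in len_without_gaps above: a 1000 bp region minus a 200 bp gap leaves 800 bp.
def _demo_len_without_gaps():
    gaps = pyinter.IntervalSet()
    gaps.add(pyinter.closedopen(400, 600))
    region = pyinter.IntervalSet()
    region.add(pyinter.closedopen(0, 1000))
    diff = region.difference(gaps)
    assert sum(x.upper_value - x.lower_value for x in diff) == 800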