def output_adjacencies(self, adjs, out_file, format, header=None):
    """Output adjacencies in tsv format

    Args:
        adjs: (List) Adjacencies
        out_file: (str) absolute path of output file
        format: (str) either "tab" or "bedpe"; for any other value nothing
                is written and out_file is not created
        header: (str) optional header string, written on its own line
                before the column headers
    """
    # name of the Adjacency method that renders one record in this format
    if format == 'bedpe':
        row_method = 'as_bedpe'
    elif format == 'tab':
        row_method = 'as_tab'
    else:
        # unsupported format: silently do nothing (kept for compatibility)
        return

    # 'with' guarantees the file is closed even if rendering a record fails
    with open(out_file, 'w') as out:
        if header is not None:
            out.write(header + '\n')

        if format == 'tab':
            out.write('%s\n' % Adjacency.show_tab_headers())
        else:
            out.write('%s\n' % Adjacency.show_bedpe_headers())

        for adj in adjs:
            output = getattr(adj, row_method)()
            try:
                out.write('%s\n' % output)
            except Exception:
                # best effort: report the failure and carry on with the
                # remaining adjacencies
                sys.stdout.write("can't output Adjacency\n")
def create_variants(self, adjs):
    """Creates variants from adjacencies and stores them in self.variants

    Adjacencies are consumed in stages. Once an adjacency becomes part of a
    variant its id is recorded, and later stages skip it:
        1. insertions extracted from split-alignment events that are
           neither 'trl' nor 'ins'
        2. imprecise insertions (split-alignment events that are not 'inv')
        3. grouped inversions
        4. insertions extracted from translocations
        5. grouped reciprocal translocations
        6. every remaining non-dubious adjacency, as a single-adjacency
           variant named after its rearrangement in upper case

    Args:
        adjs: (List) Adjacencies
    """
    def track_adjs(used_ids, variants):
        """Adds the ids of all adjacencies used by the variants to used_ids"""
        if variants:
            for variant in variants:
                for adj in variant.adjs:
                    used_ids.add(adj.id)

    self.variants = []
    adjs_ids_used = set()

    split_events = [adj for adj in adjs
                    if adj.rearrangement not in ('trl', 'ins') and
                    adj.align_types[0] == 'split']
    ins_variants, _ = Adjacency.extract_interchrom_ins(split_events)
    self.variants.extend(ins_variants)
    track_adjs(adjs_ids_used, ins_variants)

    # special cases for imprecise insertions
    candidates = [adj for adj in adjs
                  if adj.align_types[0] == 'split' and
                  adj.rearrangement != 'inv' and
                  adj.id not in adjs_ids_used]
    ins_variants, _ = Adjacency.extract_imprecise_ins(candidates,
                                                      debug=self.debug)
    self.variants.extend(ins_variants)
    track_adjs(adjs_ids_used, ins_variants)

    # handle inversions
    invs = [adj for adj in adjs
            if adj.rearrangement == 'inv' and adj.id not in adjs_ids_used]
    inv_variants = Adjacency.group_inversions(invs)
    self.variants.extend(inv_variants)
    track_adjs(adjs_ids_used, inv_variants)

    # convert translocations to insertions
    trls = [adj for adj in adjs
            if adj.rearrangement == 'trl' and adj.id not in adjs_ids_used]
    ins_variants, _ = Adjacency.extract_interchrom_ins(trls)
    self.variants.extend(ins_variants)
    track_adjs(adjs_ids_used, ins_variants)

    # group reciprocal translocations
    trls = [adj for adj in adjs
            if adj.rearrangement == 'trl' and adj.id not in adjs_ids_used]
    reciprocal_trls, _ = Adjacency.group_trls(trls)
    self.variants.extend(reciprocal_trls)
    track_adjs(adjs_ids_used, reciprocal_trls)

    # everything not used so far becomes a variant of its own, unless it
    # is dubious; this includes the remaining translocations, which come
    # out as 'TRL' variants
    for adj in adjs:
        if adj.id not in adjs_ids_used and not adj.dubious:
            self.variants.append(Variant(adj.rearrangement.upper(), [adj]))
def screen_realigns(self, use_realigns=False):
    """Realign probe sequences of adjacencies and screen results

    - genome and index_dir must have been set when object is initialized;
      if either is missing nothing is done and None is returned
    - output is always set to "realign.fa" and "realign.bam" (original
      author's note; the files are produced by Adjacency.realign)
    - will fail Adjacency if probe sequence can align to single location

    Every variant owning an adjacency whose probe alignments are rejected
    by gapped_align.screen_probe_alns is removed from self.variants.

    Args:
        use_realigns: passed through to Adjacency.realign
    """
    if not self.genome or not self.index_dir:
        return None

    # separates contig name, adjacency key and any suffix in query names
    name_sep = '.'

    all_adjs = []
    for variant in self.variants:
        all_adjs.extend(variant.adjs)

    realign_bam_file = Adjacency.realign(all_adjs,
                                         self.out_dir,
                                         probe=True,
                                         contigs_fasta=self.contig_fasta,
                                         name_sep=name_sep,
                                         genome=self.genome,
                                         index_dir=self.index_dir,
                                         num_procs=self.num_procs,
                                         use_realigns=use_realigns,
                                         )

    try:
        bam = pysam.Samfile(realign_bam_file, 'rb')
    except Exception:
        sys.exit('Error parsing realignment BAM:%s' % realign_bam_file)

    # creates mapping from query to (variant index, adjacency index)
    query_to_variant = {}
    for i, variant in enumerate(self.variants):
        for j, adj in enumerate(variant.adjs):
            query = adj.contigs[0] + name_sep + adj.key()
            query_to_variant[query] = (i, j)

    # NOTE(review): an earlier revision worked out which breakpoint of an
    # 'INS' variant the adjacency corresponds to, but the result was never
    # passed to the screening call, so that computation has been dropped.
    failed_variants = set()
    try:
        # alignments are grouped by the first two name_sep-delimited
        # fields of the query name, i.e. the keys of query_to_variant
        for key, group in groupby(bam.fetch(until_eof=True),
                                  lambda x: name_sep.join(x.qname.split(name_sep)[:2])):
            alns = list(group)
            variant_idx, adj_idx = query_to_variant[key]
            variant = self.variants[variant_idx]
            adj = variant.adjs[adj_idx]

            # only queries whose name does not end in a digit are screened
            probe_alns = [aln for aln in alns if not aln.qname[-1].isdigit()]
            if not gapped_align.screen_probe_alns(adj.aligns[0], probe_alns, adj.align_types[0]):
                if self.debug:
                    sys.stdout.write('probe align completely to one location or not aligned with confidence: %s\n' % key)
                failed_variants.add(variant)
    finally:
        bam.close()

    for failed_var in failed_variants:
        self.variants.remove(failed_var)
def find_adjs(self, min_ctg_cov, max_size=None, min_size=None, ins_as_ins=False, skip_acen=False, check_alt_paths=False, min_ctg_size=0, bad_coords=None, skip_contigs_file=None):
    """Main method to go through the BAM file, extract split and gapped
    alignments, and call the respective modules to identify adjs

    Args:
        min_ctg_cov: passed to split_align.find_chimera as min_coverage
        max_size: (int) largest event size to keep, None for no upper limit
        min_size: (int) smallest event size to keep, None for no lower limit
        ins_as_ins: passed through to gapped_align.find_adjs
        skip_acen: (bool) skip contigs with a chimeric alignment overlapping
                   the coordinates read from self.cytobands_file
        check_alt_paths: passed through to split_align.find_chimera
        min_ctg_size: (int) contigs shorter than this are skipped
        bad_coords: (str) path of a coordinates file; if it exists the
                    merged adjacencies are passed to
                    self.screen_by_coordinate together with it
        skip_contigs_file: (str) path of a file listing, one per line, the
                           names of contigs to skip

    Returns:
        List of merged adjacencies. If max_size and/or min_size is given
        only adjacencies passing the size filter are returned. Adjacencies
        whose size is not an int are kept when only min_size is given and
        dropped whenever max_size is given.
    """
    def find_events_in_single_align(align):
        """Implement as sub-function so that small-scale events can be found on split alignments too"""
        adjs = gapped_align.find_adjs(align,
                                      contig_seq,
                                      False,
                                      ins_as_ins=ins_as_ins,
                                      query_fasta=self.contig_fasta,
                                      target_fasta=self.ref_fasta)

        kept = []
        for adj in adjs:
            if self.skip_simple_repeats and self.break_region_has_low_complexity(adj.chroms[0], adj.breaks):
                if self.debug:
                    sys.stdout.write("remove contig %s %s potential simple-repeat %s:%s-%s\n" % (adj.contigs[0],
                                                                                                 adj.rearrangement,
                                                                                                 adj.chroms[0],
                                                                                                 adj.breaks[0],
                                                                                                 adj.breaks[1]))
                continue
            kept.append(adj)

        return kept

    def is_align_in_acen(align, acen):
        """Checks to see if alignment overlaps with acentromeric coordinates

        Args:
            align: alignment (Alignment)
            acen: acentromeric coordinates parsed from UCSC cytobands file
                  (Dictionary) {chrom:(start, end), (start, end)}
        Returns True if overlapped
        """
        s1, e1 = align.tstart, align.tend
        if align.target in acen:
            for (start, end) in acen[align.target]:
                # widen each region by self.acen_buffer on both sides
                s2, e2 = int(start) - self.acen_buffer, int(end) + self.acen_buffer
                if s1 <= e2 and s2 <= e1:
                    return True
        return False

    def create_set(list_file):
        """Creates set from the lines of a file (trailing newline removed)"""
        with open(list_file, 'r') as handle:
            return set(line.strip('\n') for line in handle)

    def passes_size_filter(size):
        """Applies the max_size/min_size limits to an adjacency size"""
        is_int = isinstance(size, int)
        if max_size is not None:
            # an upper limit can only be checked against a numeric size
            if not is_int or size > max_size:
                return False
            return min_size is None or size >= min_size
        # only a lower limit: sizes that are not numeric are let through
        return not is_int or size >= min_size

    acen_coords = None
    if skip_acen:
        acen_coords = get_acen_coords(self.cytobands_file)

    skip_contigs = None
    if skip_contigs_file and os.path.exists(skip_contigs_file):
        skip_contigs = create_set(skip_contigs_file)

    all_adjs = []
    for contig, group in groupby(self.bam.fetch(until_eof=True), lambda x: x.qname):
        if self.debug:
            sys.stdout.write('contig %s\n' % contig)
        alns = list(group)
        # also read by find_events_in_single_align() through its closure
        contig_seq = self.contig_fasta.fetch(contig)

        if len(contig_seq) < min_ctg_size:
            if self.debug:
                sys.stdout.write('%s(%d bp) less than min contig size %d bp\n' % (contig, len(contig_seq), min_ctg_size))
            continue

        if skip_contigs and contig in skip_contigs:
            if self.debug:
                sys.stdout.write('%s skipped\n' % contig)
            continue

        if len(alns) > 1:
            chimeric_aligns, dubious = split_align.find_chimera(alns,
                                                                self.bam,
                                                                min_coverage=min_ctg_cov,
                                                                check_alt_paths=check_alt_paths,
                                                                debug=self.debug)
            if chimeric_aligns:
                if acen_coords:
                    skip = False
                    for align in chimeric_aligns:
                        if is_align_in_acen(align, acen_coords):
                            if self.debug:
                                sys.stdout.write('skip contig %s because alignment is in centromere %s:%d-%d\n' % (contig,
                                                                                                                 align.target,
                                                                                                                 align.tstart,
                                                                                                                 align.tend
                                                                                                                 ))
                            skip = True
                            break
                    if skip:
                        continue

                adjs = split_align.find_adjs(chimeric_aligns, contig_seq, dubious=dubious, debug=self.debug)

                # indices of adjacencies to discard
                bad = set()
                for i, adj in enumerate(adjs):
                    # check if homol is simple repeat
                    if adj.homol_seq and adj.homol_seq[0] != '-' and self.is_homol_low_complexity(adj):
                        if self.debug:
                            sys.stdout.write("homol_seq is simple-repeat %s:%s\n" % (adj.contigs[0], adj.homol_seq[0]))
                        bad.add(i)

                    # check if event is simple repeat expansions
                    if self.skip_simple_repeats and self.is_novel_sequence_repeat(adj):
                        if self.debug:
                            sys.stdout.write("novel_seq is simple-repeat %s:%s\n" % (adj.contigs[0], adj.novel_seq))
                        bad.add(i)

                    # inversion with size of 1
                    if adj.rearrangement == 'inv' and adj.get_size() <= 1:
                        if self.debug:
                            sys.stdout.write("inversion with unreasonable size %s:%d %s:%d-%d\n" % (adj.contigs[0],
                                                                                                    adj.get_size(),
                                                                                                    adj.chroms[0],
                                                                                                    adj.breaks[0],
                                                                                                    adj.breaks[1]))
                        bad.add(i)

                    # two consecutive adjacencies describing the same event
                    # (same chroms, breaks and orients) at different contig
                    # positions: discard both
                    if i > 0:
                        prev = adjs[i - 1]
                        if adj.chroms == prev.chroms and\
                           adj.breaks == prev.breaks and\
                           adj.orients == prev.orients and\
                           adj.contig_breaks != prev.contig_breaks:
                            if self.debug:
                                sys.stdout.write("%s has 2 contig_breaks for same event\n" % adj.contigs[0])
                            bad.add(i - 1)
                            bad.add(i)

                if bad:
                    adjs = [adj for i, adj in enumerate(adjs) if i not in bad]

                all_adjs.extend(adjs)

                # capture small-scale events within each chimeric alignment
                for align in chimeric_aligns:
                    all_adjs.extend(find_events_in_single_align(align))

        best_align = gapped_align.find_single_unique(alns, self.bam, debug=self.debug)
        if best_align:
            all_adjs.extend(find_events_in_single_align(best_align))

    merged_adjs = Adjacency.merge(all_adjs)

    # screen out adjacencies that overlap segdups
    if bad_coords is not None and os.path.exists(bad_coords):
        self.screen_by_coordinate(merged_adjs, bad_coords)

    # size filtering
    if max_size is not None or min_size is not None:
        return [adj for adj in merged_adjs if passes_size_filter(adj.get_size())]

    return merged_adjs