def samples_overlap(recA, recB, upper_thresh=0.5, lower_thresh=0.5):
    """
    Decide whether two VCF records are called in sufficiently similar samples.

    For each record, the fraction of its called samples that is also called
    in the other record is computed. The larger of the two fractions is
    compared against `upper_thresh` and the smaller against `lower_thresh`;
    both comparisons must pass. The asymmetric thresholds are intended to keep
    sensitivity for rare variants with a false negative at one breakpoint.

    Parameters
    ----------
    recA : pysam.VariantRecord
    recB : pysam.VariantRecord
    upper_thresh : float, optional
        Minimum shared fraction required of the record with greater overlap
    lower_thresh : float, optional
        Minimum shared fraction required of the record with lesser overlap

    Returns
    -------
    samples_overlap : bool
        True when both fractions meet their thresholds.
    """
    called_a = set(svu.get_called_samples(recA))
    called_b = set(svu.get_called_samples(recB))

    if not called_a or not called_b:
        # A record without called samples shares nothing with the other one
        lesser = greater = 0
    else:
        n_shared = len(called_a.intersection(called_b))
        fractions = [n_shared / len(called_a), n_shared / len(called_b)]
        lesser, greater = min(fractions), max(fractions)

    return lesser >= lower_thresh and greater >= upper_thresh
def samples_overlap_records(recA, recB, called_samples_dict, upper_thresh=0.5, lower_thresh=0.5):
    """
    Run `samples_overlap` using per-record cached sets of called samples.

    Parameters
    ----------
    recA : record exposing `.id`
    recB : record exposing `.id`
    called_samples_dict : dict
        Cache keyed on record ID. Missing entries are filled in place with
        the set of called samples of the corresponding record.
    upper_thresh : float, optional
        Forwarded to `samples_overlap`
    lower_thresh : float, optional
        Forwarded to `samples_overlap`

    Returns
    -------
    Whatever `samples_overlap` returns for the two cached sample sets.
    """
    # Populate the cache for whichever record has not been seen yet
    for rec in (recA, recB):
        if rec.id not in called_samples_dict:
            called_samples_dict[rec.id] = set(svu.get_called_samples(rec))

    return samples_overlap(
        called_samples_dict[recA.id],
        called_samples_dict[recB.id],
        upper_thresh=upper_thresh,
        lower_thresh=lower_thresh,
    )
def samples_overlap(recordA, recordB, upper_thresh=0.8, lower_thresh=0.5):
    """
    Report if the samples called in two records overlap sufficiently.

    The fraction of each record's called samples which are shared with the
    other record is calculated. The greater fraction must reach
    `upper_thresh` and the lesser fraction must reach `lower_thresh`.

    Parameters
    ----------
    recordA : record accepted by `svu.get_called_samples`
    recordB : record accepted by `svu.get_called_samples`
    upper_thresh : float, optional
        Minimum sample overlap in record with greater overlap
    lower_thresh : float, optional
        Minimum sample overlap in record with lesser overlap

    Returns
    -------
    bool
        True when both fractions meet their thresholds.
    """
    # Get sets of called samples for each record
    samplesA = set(svu.get_called_samples(recordA))
    samplesB = set(svu.get_called_samples(recordB))

    if not samplesA or not samplesB:
        # A record with no called samples shares nothing; treat both
        # fractions as zero instead of dividing by zero.
        min_frac = max_frac = 0
    else:
        # Compute fraction of each record's samples which are shared
        shared = samplesA & samplesB
        fracA = len(shared) / len(samplesA)
        fracB = len(shared) / len(samplesB)
        min_frac, max_frac = sorted([fracA, fracB])

    return min_frac >= lower_thresh and max_frac >= upper_thresh
def filter_dn_variants(vcf, filterfile, fam, fout):
    """
    Apply the de novo filter: add parent false negatives and null child
    false positives, then write the surviving records.

    Arguments
    ---------
    vcf : pysam.VariantFile
    filterfile : file
    fam : passed through to `parse_filtered` together with `filterfile`
    fout : pysam.VariantFile
    """
    # Per-variant samples to add and to remove
    add, remove = parse_filtered(filterfile, fam)

    for record in vcf:
        was_checked = record.id in add.keys() or record.id in remove.keys()

        if was_checked:
            for sample in add.get(record.id, []):
                record.samples[sample]['GT'] = (0, 1)
            for sample in remove.get(record.id, []):
                set_null(record, sample)

            # Drop the record when no sample survived the de novo check
            if len(svu.get_called_samples(record)) == 0:
                continue

        # Records outside the de novo check are written unaltered
        fout.write(record)
def choose_background(self, record, whitelist=None, blacklist=None):
    """
    Split the cohort into called and background samples for a record.

    Parameters
    ----------
    record : record accepted by `svu.get_called_samples`
    whitelist : container of str, optional
        Samples to keep; defaults to `self.whitelist`
    blacklist : container of str, optional
        Samples to discard; defaults to `self.blacklist`

    Returns
    -------
    called : list of str
        Called samples that pass the white/blacklist
    background : list of str
        Non-called samples from `self.samples` that pass the white/blacklist,
        randomly downsampled (without replacement) to `self.n_background`
        when at least that many are available
    """
    called = svu.get_called_samples(record)
    background = [s for s in self.samples if s not in called]

    # Explicit arguments take precedence over the instance-level lists
    if whitelist is None:
        whitelist = self.whitelist
    if blacklist is None:
        blacklist = self.blacklist

    def _permitted(samples):
        return [s for s in samples if s in whitelist and s not in blacklist]

    called = _permitted(called)
    background = _permitted(background)

    if len(background) >= self.n_background:
        background = np.random.choice(background, self.n_background,
                                      replace=False).tolist()

    return called, background
def sfari_filters(vcf):
    """
    Yield records after removing calls tied to known problematic samples.

    On chromosomes X and Y, records called only in `ANEUPLOIDIES` samples are
    dropped and any remaining calls in those samples are nulled. Records whose
    SOURCES is exactly ('depth', ) are dropped when every called sample is in
    `DEPTH_EXCLUDED`.

    NOTE: the set of called samples is computed once per record and is not
    refreshed after genotypes are nulled, so the depth check sees the
    original calls.
    """
    allosomes = 'X Y'.split()

    for record in vcf:
        called = set(svu.get_called_samples(record))

        # Allosomal calls in aneuploidy samples
        if record.chrom in allosomes:
            if called.issubset(ANEUPLOIDIES):
                continue
            for sample in called.intersection(ANEUPLOIDIES):
                svu.set_null(record, sample)

        # Depth-only events restricted to excluded samples
        is_depth_only = record.info['SOURCES'] == ('depth', )
        if is_depth_only and called.issubset(DEPTH_EXCLUDED):
            continue

        yield record
def scrape_sample_stats(self, record):
    """
    Write one tab-delimited line per called sample to `self.obs_fout`.

    Each line holds: sample, record ID, chromosome, SVTYPE, evidence source
    ('depth-only', 'pesr+depth' or 'pesr-only', derived from
    INFO/ALGORITHMS) and inheritance status. Samples without parents in
    `self.fam` are labelled 'parent'; other samples take the status reported
    by `get_inh`.
    """
    name = record.id
    chrom = record.chrom
    svtype = record.info['SVTYPE']

    algorithms = record.info['ALGORITHMS']
    if algorithms == ('depth', ):
        source = 'depth-only'
    elif 'depth' in algorithms:
        source = 'pesr+depth'
    else:
        source = 'pesr-only'

    called = svu.get_called_samples(record)

    # Invert {status: [samples]} into {sample: status}
    inh_map = {
        member: status
        for status, members in get_inh(called, self.fam).items()
        for member in members
    }

    row = '{sample}\t{name}\t{chrom}\t{svtype}\t{source}\t{inh}\n'
    for sample in called:
        inh = inh_map[sample] if self.fam.samples[sample].has_parents else 'parent'
        self.obs_fout.write(row.format(sample=sample, name=name, chrom=chrom,
                                       svtype=svtype, source=source, inh=inh))
def _cache_sample_overlap(record, force=False):
    """
    Ensure `sample_overlap_cache` holds the called-sample index set of a record.

    Parameters
    ----------
    record : record exposing `.id`, accepted by `svu.get_called_samples`
    force : bool, optional
        Recompute and overwrite the cached entry even if one exists

    Returns
    -------
    On a cache hit (and `force` is False): the cached set of sample indices.
    On a miss or when forced: the freshly computed list of called sample
    names, as returned by `svu.get_called_samples`.

    NOTE(review): the two return paths yield different things (names vs.
    indices). Callers that use the return value should be checked; the
    behavior is kept as-is here.
    """
    if not force and record.id in sample_overlap_cache:
        return sample_overlap_cache[record.id]

    called_names = svu.get_called_samples(record)
    sample_overlap_cache[record.id] = {
        sample_id_to_index_dict[name] for name in called_names
    }
    return called_names
def is_pilot_denovo(svrecord):
    """
    Report whether a record is a private Pilot call in a child sample.

    Parameters
    ----------
    svrecord : object exposing the underlying record as `.record`

    Returns
    -------
    bool
        True when the record ID starts with 'Pilot', exactly one sample is
        called, and that sample's ID ends with 'p1' or 's1'.
    """
    record = svrecord.record
    if not record.id.startswith('Pilot'):
        return False

    called = svu.get_called_samples(record)

    # Check privacy before indexing: a record with no called samples
    # previously raised IndexError on called[0].
    if len(called) != 1:
        return False

    return called[0].endswith(('p1', 's1'))
def link_cpx(vcf, bkpt_window=300):
    """
    Cluster breakpoints which lie near each other and share samples.

    Breakpoints are linked when they fall within `bkpt_window` of each other,
    are not both DEL or both DUP, pass `samples_overlap` on their called
    samples, and pass `close_enough`. Clusters are the connected components
    of the resulting link graph.

    Parameters
    ----------
    vcf : object exposing `.filename` (presumably pysam.VariantFile - confirm)
        Breakpoint VCF; also passed to `extract_breakpoints`
    bkpt_window : int, optional
        Window passed as `w` to the bedtools `window` call

    Returns
    -------
    clusters : list of deque
        One deque of breakpoint records per connected component
    """
    bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # Identify breakpoints which overlap within specified window
    overlap = bt.window(bt, w=bkpt_window).saveas()

    # Exclude intersections where two DELs or two DUPs cluster together
    # (fields 4 and 10 are compared against SV type names, so they are
    # presumably the svtype columns of the two joined intervals)
    overlap = overlap.filter(lambda b: not (b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas()
    overlap = overlap.filter(lambda b: not (b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas()

    # Get linked variant IDs
    links = [(b[3], b[9]) for b in overlap.intervals]
    linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links)))
    linked_IDs = np.array(linked_IDs)

    # Map variant IDs to indices
    bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)}
    indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links])

    # Extract VariantRecords corresponding to breakpoints
    n_bkpts = len(linked_IDs)
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Build called sample index
    # Get sets of called samples for each linked record, computed once per
    # breakpoint index rather than once per link
    sample_sets_dict = {
        idx: set(svu.get_called_samples(bkpts[idx]))
        for idx in set(indexed_links.flatten().tolist())
    }

    # Exclude wildly disparate overlaps
    # Build sparse graph from links; the identity diagonal makes every
    # breakpoint a member of its own component
    G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')
    for i, j in indexed_links:
        # NOTE(review): samples_overlap receives sample sets here, not
        # records - confirm this matches the imported implementation
        if (samples_overlap(sample_sets_dict[i], sample_sets_dict[j]) and
                close_enough(bkpts[i], bkpts[j])):
            G[i, j] = 1

    # Generate lists of clustered breakpoints
    n_comp, comp_list = sps.csgraph.connected_components(G)
    clusters = [deque() for x in range(n_comp)]
    for i, c_label in enumerate(comp_list):
        clusters[c_label].append(bkpts[i])

    return clusters
def merge_nested_depth_record(pesr_record, depth_record):
    """
    Add samples from nested depth record to PE/SR record.

    The depth record's ID is appended to the PE/SR record's MEMBERS and
    'depth' is appended to its SOURCES. Then, for every family (sample ID
    prefix before the first '.') with at least one member called in both
    records, all of that family's depth calls are nulled in the depth record
    and set to (0, 1) in the PE/SR record.

    Parameters
    ----------
    pesr_record : record mutated in place (INFO and genotypes)
    depth_record : record mutated in place (genotypes nulled)
    """
    pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                   (depth_record.id, ))
    pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )

    depth_samples = svu.get_called_samples(depth_record)
    # Snapshot of the PE/SR calls taken before any genotype is modified
    pesr_samples = set(svu.get_called_samples(pesr_record))

    # Group depth calls by family ID. The previous key, s.strip('.')[0],
    # returned only the first character of the sample ID, and
    # itertools.groupby merges adjacent items only; a dict groups correctly
    # regardless of sample order.
    families = {}
    for sample in depth_samples:
        families.setdefault(sample.split('.')[0], []).append(sample)

    # If a sample is called in both the pe/sr record and the nested depth
    # record, move any relatives called in the depth record to the pe/sr record
    for samples in families.values():
        if any(s in pesr_samples for s in samples):
            for sample in samples:
                svu.set_null(depth_record, sample)
                pesr_record.samples[sample]['GT'] = (0, 1)
def to_bed(record, cluster):
    """
    Format a record as a single tab-delimited BED-style line.

    Columns: chrom, start, end, name (record ID joined to the chromosome
    with '__'), svtype, comma-joined called samples, comma-joined SOURCES,
    batch ('Pilot' when the ID contains 'Pilot', otherwise 'Phase1') and the
    supplied cluster label.
    """
    values = {
        'chrom': record.chrom,
        'start': record.pos,
        'end': record.stop,
        'name': record.id + '__' + record.chrom,
        'svtype': record.info['SVTYPE'],
        'samples': ','.join(svu.get_called_samples(record)),
        'sources': ','.join(record.info['SOURCES']),
        'batch': 'Phase1' if 'Pilot' not in record.id else 'Pilot',
        'cluster': cluster,
    }

    template = ('{chrom}\t{start}\t{end}\t{name}\t{svtype}\t{samples}\t{sources}\t'
                '{batch}\t{cluster}\n')
    return template.format(**values)
def overlap_fail(phase1, pilot, header, dist=300, frac=0.1):
    """
    Cluster Phase1 and Pilot variants and yield the Pilot records to keep.

    Pilot de novo records (per `is_pilot_denovo`) are always yielded on their
    own. Of the remaining records, only clusters made up solely of Pilot
    records are reported; clusters that include other records are skipped.

    Parameters
    ----------
    phase1, pilot : inputs passed to `VCFCluster`
    header : passed to `make_new_record`
    dist : int, optional
        Clustering distance passed to `VCFCluster`
    frac : float, optional
        Clustering overlap fraction passed to `VCFCluster`

    Yields
    ------
    Records built by `make_new_record`.

    Raises
    ------
    Exception
        If more than two Pilot records cluster together and they are not all
        depth-only.
    """
    svc = VCFCluster([phase1, pilot], dist=dist, frac=frac, preserve_ids=True)

    for cluster in svc.cluster(merge=False):
        # Keep any pilot de novo variants
        other_records = []
        for record in cluster.records:
            if is_pilot_denovo(record):
                yield make_new_record(SVRecordCluster([record]), header)
            else:
                other_records.append(record)

        if not other_records:
            continue
        cluster.records = other_records

        # skip phase1-only variants and pilot variants that overlap with
        # rejected phase1 variants
        if not is_batch_only(cluster, 'Pilot'):
            continue

        records = cluster.records
        depth_only = all(is_depth_only(r) for r in records)

        if len(records) == 1:
            yield make_new_record(cluster, header)

        # check that we're not overclustering Pilot variants
        elif len(records) == 2:
            samples = [svu.get_called_samples(r.record) for r in records]
            if samples_overlap(*samples) or depth_only:
                yield make_new_record(cluster, header)
            else:
                # Dissimilar samples: report each record separately
                for svrecord in cluster.records:
                    yield make_new_record(SVRecordCluster([svrecord]), header)

        elif depth_only:
            yield make_new_record(cluster, header)

        else:
            # The interactive debugger breakpoint that used to sit here was
            # removed: it blocks non-interactive runs and hides this error
            # behind an ImportError when ipdb is not installed.
            raise Exception('Multiple Pilot variants clustered')
def choose_background(self, record, candidates):
    """
    Pick background samples unrelated to the record's carriers and candidates.

    Every '<quad>.fa/mo/p1/s1' ID of each candidate's quad (prefix before the
    first '.') is excluded, together with all called samples. When at least
    `self.n_background` samples remain they are randomly downsampled, without
    replacement, to exactly that many.

    Returns
    -------
    background : list of str
    """
    quads = sorted({candidate.split('.')[0] for candidate in candidates})
    related = [
        quad + '.' + member
        for quad in quads
        for member in ('fa', 'mo', 'p1', 's1')
    ]

    excluded = set(svu.get_called_samples(record)).union(related)
    background = [s for s in self.samples if s not in excluded]

    if len(background) >= self.n_background:
        background = np.random.choice(background, self.n_background,
                                      replace=False).tolist()

    return background
def process_metadata(variants, bed=False, batch_list=None):
    """
    Tabulate parental/child variant frequency and inheritance rate per variant.

    Parameters
    ----------
    variants : iterable
        BED lines when `bed` is True (column 4 = name, column 5 =
        comma-separated called samples, column 6 = svtype; lines starting
        with '#' are skipped), otherwise a VCF whose header lists the samples
    bed : bool, optional
    batch_list : file, optional
        One sample ID per line; read only when `bed` is True

    Returns
    -------
    metadata : pd.DataFrame
        Columns: name, svtype, parental_vf, child_vf, inh_rate. Rows pass
        through a numpy array before the frame is built.
    """
    if bed:
        samples = [line.strip() for line in batch_list.readlines()]
    else:
        samples = list(variants.header.samples)

    # Cohort-wide denominators
    n_parents = len([s for s in samples if _is_parent(s)])
    n_children = len([s for s in samples if _is_child(s)])

    rows = deque()
    for variant in variants:
        if bed:
            if variant.startswith('#'):
                continue
            fields = variant.strip().split()
            called = fields[4].split(',')
            name, svtype = fields[3], fields[5]
        else:
            called = svu.get_called_samples(variant)
            name = variant.id
            svtype = variant.info['SVTYPE']

        parental_vf = len([s for s in called if _is_parent(s)]) / n_parents
        child_vf = len([s for s in called if _is_child(s)]) / n_children

        # Inheritance rate is only defined when some child carries the variant
        inh_rate = get_inh_rate(called) if child_vf > 0 else 0

        rows.append([name, svtype, parental_vf, child_vf, inh_rate])

    table = np.array(rows)
    return pd.DataFrame(table,
                        columns='name svtype parental_vf child_vf inh_rate'.split())
def get_denovo_candidates(record, fam, max_parents=10):
    """
    List called samples whose parents are both uncalled (putative de novo).

    Parameters
    ----------
    record : record accepted by `svu.get_called_samples`
    fam : object whose `.samples[ID]` entries expose `is_parent`,
        `has_parents`, `mother`, `father` and `ID`
    max_parents : int, optional
        Records called in more than this many parents yield no candidates

    Returns
    -------
    denovo : list
        `ID` attributes of the qualifying samples
    """
    called = svu.get_called_samples(record)

    n_called_parents = sum(1 for s in called if fam.samples[s].is_parent)
    if n_called_parents > max_parents:
        return []

    def _neither_parent_called(sample):
        return sample.mother not in called and sample.father not in called

    members = (fam.samples[ID] for ID in called)
    return [
        sample.ID
        for sample in members
        if sample.has_parents and _neither_parent_called(sample)
    ]
def run(self):
    """
    Pass every qualifying record of `self.vcf` to `self.test_record`.

    A record qualifies when it has at least one de novo candidate, is called
    in no more than `self.max_parents` parents (sample IDs ending in 'fa' or
    'mo'), and has a STRANDS value of '+-', '-+', '++' or '--'.

    NOTE: INFO/STRANDS is read unconditionally.
    """
    stranded = '+- -+ ++ --'.split()

    for record in self.vcf:
        # Records without any Mendelian violations are skipped
        if len(get_denovo_candidates(record, self.max_parents)) == 0:
            continue

        # Restrict to variants rare among parents
        called = svu.get_called_samples(record)
        n_parents = sum(1 for s in called if s.endswith(('fa', 'mo')))
        if n_parents > self.max_parents:
            continue

        # Non-stranded records are skipped
        if record.info['STRANDS'] in stranded:
            self.test_record(record)
def main():
    """
    Apply a de novo filter table to a VCF.

    The table is read with `pd.read_table` and must provide the columns
    `name`, `operation` and `sample`. For each listed variant, samples under
    operation 'add' are set to GT (0, 1) and samples under 'remove' are
    nulled. Variants absent from the table are written unchanged; listed
    variants left without any called sample are dropped.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf')
    parser.add_argument('dn_filter')
    parser.add_argument('fout')
    args = parser.parse_args()

    vcf = pysam.VariantFile(args.vcf)
    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    # One row per variant, one column per operation, samples comma-joined
    dn_filter = pd.read_table(args.dn_filter)
    dn_filter = dn_filter.pivot_table(index='name', columns='operation',
                                      values='sample',
                                      aggfunc=lambda s: ','.join(s))

    def _listed_samples(variant_id, operation):
        """Return the samples listed for a variant under one operation."""
        # The pivot only has a column for operations present in the table
        if operation not in dn_filter.columns:
            return []
        value = dn_filter.loc[variant_id, operation]
        # Missing variant/operation combinations are NaN in the pivot, not
        # None, so an `is not None` check lets them through to `.split`
        if value is None or pd.isnull(value):
            return []
        return value.split(',')

    for record in vcf:
        if record.id not in dn_filter.index:
            fout.write(record)
            continue

        # Add false negative parents
        for sample in _listed_samples(record.id, 'add'):
            record.samples[sample]['GT'] = (0, 1)

        # Remove false positive children
        for sample in _listed_samples(record.id, 'remove'):
            svu.set_null(record, sample)

        # Skip variant if the child was only sample
        called = svu.get_called_samples(record)
        if len(called) == 0:
            continue

        fout.write(record)

    fout.close()
def scrape_record_stats(record, fam):
    """
    Summarize one record as a tab-delimited line of per-variant statistics.

    Parameters
    ----------
    record : pysam.VariantRecord
    fam : object whose `.samples[ID].has_parents` distinguishes children
        from parents; also passed to `get_inh`

    Returns
    -------
    statline : str
        chrom, start, end, name, svtype, svsize (INFO/SVLEN), comma-joined
        ALGORITHMS, number of called samples / parents / children, number of
        (1, 1) and (0, 1) genotypes, and the number of de novo, maternal,
        paternal and biparental samples reported by `get_inh`. No trailing
        newline.
    """
    name = record.id
    svtype = record.info['SVTYPE']
    svsize = record.info['SVLEN']
    algorithms = ','.join(record.info['ALGORITHMS'])

    called = svu.get_called_samples(record)

    def _n_with_genotype(genotype):
        return len([s for s in called if record.samples[s]['GT'] == genotype])

    n_parents = len([s for s in called if not fam.samples[s].has_parents])
    n_children = len([s for s in called if fam.samples[s].has_parents])
    n_homs = _n_with_genotype((1, 1))
    n_hets = _n_with_genotype((0, 1))

    inh_status = get_inh(called, fam)

    template = ('{chrom}\t{start}\t{end}\t'
                '{name}\t{svtype}\t{svsize}\t{algorithms}\t'
                '{n_called}\t{n_parents}\t{n_children}\t'
                '{n_homs}\t{n_hets}\t'
                '{n_denovo}\t{n_maternal}\t{n_paternal}\t{n_biparental}')

    return template.format(
        chrom=record.chrom,
        start=record.pos,
        end=record.stop,
        name=name,
        svtype=svtype,
        svsize=svsize,
        algorithms=algorithms,
        n_called=len(called),
        n_parents=n_parents,
        n_children=n_children,
        n_homs=n_homs,
        n_hets=n_hets,
        n_denovo=len(inh_status['denovo']),
        n_maternal=len(inh_status['maternal']),
        n_paternal=len(inh_status['paternal']),
        n_biparental=len(inh_status['biparental']),
    )
def filter_denovo_records(vcf, fam, max_parents=10):
    """
    Yield records that carry a putative de novo call and are rare in parents.

    Parameters
    ----------
    vcf : iterable of records
    fam : object whose `.samples[ID].is_parent` flags parents; also passed to
        `get_denovo_candidates`
    max_parents : int, optional
        Maximum number of called parents for a record to be kept

    Yields
    ------
    Records with at least one de novo candidate, at most `max_parents` called
    parents and, when INFO/STRANDS is present, a value of '+-', '-+', '++'
    or '--'.
    """
    stranded = '+- -+ ++ --'.split()

    for record in vcf:
        # No Mendelian violation, nothing to report
        if not get_denovo_candidates(record, fam, max_parents):
            continue

        # Restrict to variants rare among parents
        called = svu.get_called_samples(record)
        n_parents = sum(1 for s in called if fam.samples[s].is_parent)
        if n_parents > max_parents:
            continue

        # Records that declare strands must declare valid ones
        has_strands = 'STRANDS' in record.info.keys()
        if has_strands and record.info['STRANDS'] not in stranded:
            continue

        yield record
def get_denovo_candidates(record, max_parents=20):
    """
    Obtain list of samples which are putatively called de novo.

    Sample IDs are expected to look like '<quad>.<member>'. A family's called
    samples are candidates when neither 'fa' nor 'mo' of that quad is called.

    Parameters
    ----------
    record : record accepted by `svu.get_called_samples`
    max_parents : int, optional
        Records called in more than this many parents (IDs ending in 'fa' or
        'mo') yield no candidates

    Returns
    -------
    denovo : list of str
        Candidate samples, families in order of first appearance
    """
    called = svu.get_called_samples(record)
    parents = [s for s in called if s.endswith(('fa', 'mo'))]
    if len(parents) > max_parents:
        return []

    # Group by quad with a dict rather than itertools.groupby: groupby only
    # merges adjacent items, so a family whose members are not contiguous in
    # `called` would be split and its children misreported as de novo.
    families = {}
    for sample in called:
        families.setdefault(sample.split('.')[0], []).append(sample)

    denovo = []
    for samples in families.values():
        # Add putative de novo calls
        members = [s.split('.')[1] for s in samples]
        if 'fa' not in members and 'mo' not in members:
            denovo += samples

    return denovo
def count_svtypes(vcf):
    """
    Tally, per sample, how many called records there are of each SVTYPE.

    Records lacking an SVTYPE INFO entry are tallied under 'NO_SVTYPE'.

    Parameters
    ----------
    vcf : pysam.VariantFile

    Returns
    -------
    counts : pd.DataFrame
        Long-format table with columns: sample, svtype, count
    """
    # One svtype -> count mapping per sample in the header
    count_dict = {sample: defaultdict(int) for sample in vcf.header.samples}

    for record in vcf:
        if 'SVTYPE' in record.info.keys():
            svtype = record.info['SVTYPE']
        else:
            svtype = 'NO_SVTYPE'

        for sample in svu.get_called_samples(record):
            count_dict[sample][svtype] += 1

    # Wide table (one column per svtype); absent combinations become zero
    wide = pd.DataFrame.from_dict(count_dict, orient='index')
    wide = wide.fillna(0).astype(int)
    wide = wide.reset_index().rename(columns={'index': 'sample'})

    # Reshape to one row per (sample, svtype)
    return pd.melt(wide, id_vars=['sample'],
                   var_name='svtype', value_name='count')
def select_mosaic_candidates(vcf, metrics, mode, cutoffs):
    """
    Yield singleton DEL/DUP records whose metrics pass the mosaic cutoffs.

    Parameters
    ----------
    vcf : iterable of records
    metrics : pd.DataFrame
        Requires columns name, svtype, svsize, poor_region_cov, RD_log_pval
        and, in 'pesr' mode, RD_log_2ndMaxP
    mode : str
        'pesr' applies `cutoffs.min_pval` and `cutoffs.min_secondp`; any other
        value applies the svtype-specific `cutoffs.del_min_pval` /
        `cutoffs.dup_min_pval`
    cutoffs : object exposing the attributes named above

    Yields
    ------
    Records called in exactly one sample, present among the passing metric
    names, with SVTYPE DEL or DUP.
    """
    cnv_types = 'DEL DUP'.split()

    # Shared requirements: CNV, at least 5 kb, limited poor-region coverage
    keep = (metrics.svtype.isin(cnv_types) &
            (metrics.svsize >= 5000) &
            (metrics.poor_region_cov < 0.3))

    if mode == 'pesr':
        keep = (keep &
                (metrics.RD_log_pval >= cutoffs.min_pval) &
                (metrics.RD_log_2ndMaxP >= cutoffs.min_secondp))
    else:
        passes_del = ((metrics.svtype == 'DEL') &
                      (metrics.RD_log_pval >= cutoffs.del_min_pval))
        passes_dup = ((metrics.svtype == 'DUP') &
                      (metrics.RD_log_pval >= cutoffs.dup_min_pval))
        keep = keep & (passes_del | passes_dup)

    candidate_IDs = metrics.loc[keep].name.values

    for record in vcf:
        called = svu.get_called_samples(record)
        if len(called) != 1:
            continue
        if record.id in candidate_IDs and record.info['SVTYPE'] in cnv_types:
            yield record
def from_vcf(cls, record, whitelist=None):
    """
    Build an instance from a VCF record.

    Parameters
    ----------
    record : pysam.VariantRecord
    whitelist : container of str, optional
        When given, only called samples present in it are kept

    Returns
    -------
    `cls(chrom, pos, CHR2, stop, id, samples, STRANDS)`
    """
    first_chrom, first_pos = record.chrom, record.pos
    second_chrom, second_pos = record.info['CHR2'], record.stop
    strands = record.info['STRANDS']

    called = svu.get_called_samples(record)
    if whitelist is None:
        samples = called
    else:
        samples = [s for s in called if s in whitelist]

    return cls(first_chrom, first_pos, second_chrom, second_pos,
               record.id, samples, strands)
def choose_background(self, record, candidates):
    """
    Pick background samples unrelated to the record's carriers and candidates.

    Excluded are all called samples, the candidates themselves and, for each
    candidate with parents, its father, its mother and every child listed for
    the father in `self.fam`. When at least `self.n_background` samples remain
    they are randomly downsampled, without replacement, to that many.

    Returns
    -------
    background : list of str
    """
    related = list(candidates)
    for candidate in candidates:
        member = self.fam.samples[candidate]
        if not member.has_parents:
            continue
        related.extend([member.father, member.mother])
        # Siblings are found through the father's list of children
        related += self.fam.samples[member.father].children

    excluded = set(svu.get_called_samples(record)).union(related)
    background = [s for s in self.samples if s not in excluded]

    if len(background) >= self.n_background:
        background = np.random.choice(background, self.n_background,
                                      replace=False).tolist()

    return background
def __init__(self, record):
    """
    Collect the per-record attributes of a variant record.

    `level_of_support` ranks the INFO/EVIDENCE combination from 1 (PE, SR and
    RD) to 8 (no evidence listed). Evidence made up only of other labels
    raises ValueError.
    """
    self.id = record.id

    ev = set(record.info['EVIDENCE']) if 'EVIDENCE' in record.info else set()

    # Keyed on (has PE, has SR, has RD)
    support_by_evidence = {
        (True, True, True): 1,
        (True, False, True): 2,
        (True, True, False): 3,
        (False, True, True): 4,
        (True, False, False): 5,
        (False, False, True): 6,
        (False, True, False): 7,
    }
    evidence_key = ('PE' in ev, 'SR' in ev, 'RD' in ev)
    if evidence_key in support_by_evidence:
        self.level_of_support = support_by_evidence[evidence_key]
    elif len(ev) == 0:
        self.level_of_support = 8
    else:
        raise ValueError("Uninterpretable evidence: {}".format(ev))

    self.both_end_support = (
        bothside_pass[record.id] if record.id in bothside_pass else 0
    )
    self.sr_fail = record.id in background_fail
    self.is_bnd = record.info['SVTYPE'] == 'BND'
    self.vargq = record.info['varGQ']

    # Called samples are stored as indices, not names
    self.called_samples = [
        sample_id_to_idx[name] for name in svu.get_called_samples(record)
    ]
    self.freq = len(self.called_samples)

    self.length = record.info['SVLEN']
    self.gt_50bp = self.length >= 50
    self.is_mei = 'melt' in record.info['ALGORITHMS']
def main():
    """
    Apply the minGQ lookup-table filter to every record of a VCF.

    A HIGH_<prefix>_NOCALL_RATE FILTER line is added to the header. Each
    record is passed to `apply_minGQ_filter` (multiallelic records only with
    --multiallelics), optionally stripped of AF-related INFO keys, optionally
    has its INS/MEI SVTYPE reset to 'INS', and is then written out. With
    --dropEmpties, records without any called sample are not written.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Input vcf (supports "stdin").')
    parser.add_argument('minGQtable', help='Tab-delimited minGQ filtering lookup' +
                        ' table generated by create_minGQ_lookup_table.R.')
    parser.add_argument('fout', help='Output file (supports "stdout").')
    parser.add_argument('-m', '--minGQ', help='Global min GQ', type=int,
                        default=0, dest='globalMin')
    parser.add_argument('--multiallelics', default=False, action='store_true',
                        help='Also apply filtering to multiallelic sites ' +
                        '(default: skip multiallelics).')
    parser.add_argument('--dropEmpties', default=False, action='store_true',
                        help='After GT reassignments, drop any SV with no remaining ' +
                        ' non-ref samples (default: keep all SV).')
    parser.add_argument('--simplify-INS-SVTYPEs', default=False, action='store_true',
                        help='Resets the SVTYPE of all INS variants, including MEIs, ' +
                        'to be SVTYPE=INS (default: keep original SVTYPEs).')
    parser.add_argument('--maxNCR', help='Max no-call rate among all ' +
                        'samples before adding a flag to the record\'s FILTER field' +
                        ' (default: 0.005)',
                        type=float, default=0.005, dest='maxNCR')
    parser.add_argument('--cleanAFinfo', help='Remove all AF-related terms from ' +
                        ' the INFO field and VCF header (default: keep all terms).',
                        default=False, action='store_true')
    parser.add_argument('--prefix', help='Cohort label to append to NCR FILTER.',
                        default='COHORT', dest='prefix')
    args = parser.parse_args()

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    # Add HIGH_NOCALL_RATE filter to vcf header. The Description value is a
    # quoted string and must be closed before the final '>'; the closing
    # quote was previously missing.
    NEW_FILTER = (
        '##FILTER=<ID=HIGH_{prefix}_NOCALL_RATE,Description="More than '
        '{ncr:.2%} of {prefix} sample GTs were '
        'masked as no-call GTs due to low GQ. Indicates a possibly noisy locus '
        'in {prefix} samples.">'
    ).format(prefix=args.prefix, ncr=args.maxNCR)
    header = vcf.header
    header.add_line(NEW_FILTER)
    filter_text = 'HIGH_{0}_NOCALL_RATE'.format(args.prefix)

    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    # Make dummy lookup tables for SVLEN, AF, SVTYPE, FILTER, and EV
    SVLEN_table = _make_SVLEN_interval_dict(args.minGQtable)
    AF_table = _make_AF_interval_dict(args.minGQtable)
    SVTYPE_table = _make_SVTYPE_dict(args.minGQtable)
    FILTER_table = _make_FILTER_dict(args.minGQtable, vcf)
    EV_table = _make_EV_dict(args.minGQtable)

    # Make minGQ lookup table
    minGQ_dict = make_minGQ_dict(args.minGQtable, SVLEN_table, AF_table,
                                 SVTYPE_table, FILTER_table, EV_table)

    # INFO keys removed by --cleanAFinfo: biallelic AF annotation followed by
    # CN frequency annotation
    af_info_keys = (
        'AN AC AF N_BI_GENOS N_HOMREF N_HET N_HOMALT '
        'FREQ_HOMREF FREQ_HET FREQ_HOMALT '
        'CN_NUMBER CN_COUNT CN_FREQ CN_NONREF_COUNT CN_NONREF_FREQ'
    ).split()
    ins_keywords = 'INS MEI'.split()

    # Iterate over records in vcf and apply filter
    for record in vcf.fetch():
        # Do not process multiallelic variants, unless optioned
        if args.multiallelics or not _is_multiallelic(record):
            apply_minGQ_filter(record, minGQ_dict, SVLEN_table, AF_table,
                               SVTYPE_table, FILTER_table, EV_table,
                               globalMin=args.globalMin, maxNCR=args.maxNCR,
                               highNCR_filter=filter_text)

        if args.cleanAFinfo:
            for key in af_info_keys:
                if key in record.info.keys():
                    record.info.pop(key)

        # Standardize SVTYPE for all INS variants, if optioned
        if args.simplify_INS_SVTYPEs:
            svtype_parts = record.info['SVTYPE'].split(':')
            if any(keyword in svtype_parts for keyword in ins_keywords):
                record.info['SVTYPE'] = 'INS'

        if args.dropEmpties:
            samps = svu.get_called_samples(record, include_null=False)
            if len(samps) > 0:
                fout.write(record)
        else:
            fout.write(record)

    fout.close()
def add_samples(pesr_record, depth_record):
    """
    Mark every sample called in the depth record as (0, 1) in the PE/SR record.

    Only GT is set; no other FORMAT field is transferred.
    """
    # TODO: add pesr/depth FORMAT fields
    depth_called = svu.get_called_samples(depth_record)
    for name in depth_called:
        genotype = pesr_record.samples[name]
        genotype['GT'] = (0, 1)
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.8):
    """
    Merge depth calls into matching PE/SR calls and yield the combined records.

    Two passes are made over bedtools intersections of the two call sets:

    1. Pairs of matching SV type that reciprocally overlap by `frac` have the
       depth record's samples added to the PE/SR record; those depth records
       are then removed from the output.
    2. Remaining depth records of matching SV type that are at least 90%
       covered by a PE/SR record are handed to `merge_nested_depth_record`.

    Parameters
    ----------
    pesr_vcf : VCF object that is iterable, supports `.reset()` and is
        accepted by `svu.vcf2bedtool`
    depth_vcf : same requirements as `pesr_vcf`
    frac : float, optional
        Reciprocal overlap fraction required in the first pass

    Yields
    ------
    Records from both sources in (chrom, pos, CHR2, stop) order, with FORMAT
    keys other than GT deleted and SOURCES/MEMBERS de-duplicated and sorted.
    Records with no called samples are skipped.
    """
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation (both VCFs were consumed above)
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_strands=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_strands=False)

    # Merge depth records with PE/SR records if they share `frac`
    # reciprocal overlap (80% by default)
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Add depth record's samples to PE/SR
        filtered_depth_IDs.append(depth_id)
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                       (depth_record.id, ))
        pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )
        add_samples(pesr_record, depth_record)

    # Remove overlapping depth records (not performed in the for loop above,
    # to account for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in set(filtered_depth_IDs):
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)

    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Skip depth records we already added with the reciprocal overlap
        # pass; they are no longer present in records['depth']
        if depth_id in filtered_depth_IDs:
            continue

        # If sample is in both depth record and pe/sr record, remove it from
        # depth record
        depth_record = records['depth'][depth_id]
        pesr_record = records['pesr'][pesr_id]

        merge_nested_depth_record(pesr_record, depth_record)

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys
        for key in record.format.keys():
            if key != 'GT':
                del record.format[key]

        record.info['SOURCES'] = sorted(set(record.info['SOURCES']))
        record.info['MEMBERS'] = sorted(set(record.info['MEMBERS']))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
def merge_linked_depth_calls(vcf, ID_links, flagged=()):
    """
    Merge linked depth calls into single records spanning the maximal region.

    Records that take part in no link are yielded unchanged as they are read.
    Linked records are clustered with `slink`; each multi-record cluster is
    collapsed into a copy of its first record that spans from the smallest
    `pos` to the largest `stop` and carries a (0, 1) genotype for every sample
    called in any member.

    Parameters
    ----------
    vcf : pysam.VariantFile
    ID_links : list of (str, str)
        Pairs of linked record IDs
    flagged : iterable of str, optional
        Multiallelic sites; a merged record containing any of them gets
        INFO/MULTIALLELIC set

    Yields
    ------
    Unlinked records, single-record clusters and merged records.
    """
    # Sets give constant-time membership tests in the per-record loops below
    linked_IDs = {ID for link in ID_links for ID in link}
    flagged_IDs = set(flagged)

    # If a record wasn't linked with a bedtools merge, just return it
    record_map = {}
    for record in vcf:
        if record.id not in linked_IDs:
            yield record
        else:
            record_map[record.id] = record

    # Ignore links on other chromosomes, i.e. links for which either record
    # was not seen in this VCF
    ID_links = [
        link for link in ID_links
        if link[0] in record_map and link[1] in record_map
    ]

    # Convert links from pairs of IDs to pairs of records
    record_links = np.empty([len(ID_links), 2], dtype=object)
    for i, link in enumerate(ID_links):
        record_links[i, 0] = record_map[link[0]]
        record_links[i, 1] = record_map[link[1]]

    clusters = slink(record_links, record_map)

    # Merge clusters
    for cluster in clusters:
        if len(cluster) == 1:
            yield cluster[0]
            continue

        # Take maximal region
        start = np.min([record.pos for record in cluster])
        end = np.max([record.stop for record in cluster])

        merged_record = cluster[0].copy()
        merged_record.pos = start
        merged_record.stop = end

        # Take union of called samples
        for record in cluster:
            for sample in svu.get_called_samples(record):
                merged_record.samples[sample]['GT'] = (0, 1)

        # Flag multiallelic
        if any(record.id in flagged_IDs for record in cluster):
            merged_record.info['MULTIALLELIC'] = True

        yield merged_record