def make_record(self):
    """Build the merged VCF record for this cluster of records.

    The first record in the cluster serves as the template; any sample
    called in a later record but absent from the template is marked
    heterozygous (0/1) so the merged record carries the union of calls.
    """
    self.vcf_record = self.records[0].copy()
    called = set(svu.get_called_samples(self.vcf_record))

    # Fold in samples called only in the remaining records
    for other in itertools.islice(self.records, 1, None):
        for sample in svu.get_called_samples(other):
            if sample in called:
                continue
            self.vcf_record.samples[sample]['GT'] = (0, 1)
            called.add(sample)
def filter_dn_variants(vcf, metrics, fout):
    """Apply per-sample de novo filters and write surviving records.

    Records absent from `metrics` are written through unchanged. For
    checked records, samples that passed the de novo test are set to
    het (0/1), failing samples are nulled out, and the record is only
    written if at least one called sample remains.

    Parameters
    ----------
    vcf : pysam.VariantFile
        Input VCF to filter.
    metrics : pd.DataFrame
        Per (name, sample) de novo test results. Expected columns include
        name, sample, svtype, sources, svsize, rd_pass, pesr_pass.
    fout : pysam.VariantFile
        Open output VCF.
    """
    CNV = 'DEL DUP'.split()

    is_cnv = metrics.svtype.isin(CNV)
    depth_only = metrics.sources == 'depth'
    pesr_size_filter = (metrics.svsize >= 1000)

    # Depth-only CNVs and large PE/SR CNVs must pass the read-depth test;
    # small PE/SR CNVs and all non-CNVs must pass the PE/SR test.
    passing = ((depth_only & is_cnv & metrics.rd_pass) |
               (~depth_only & is_cnv & pesr_size_filter & metrics.rd_pass) |
               (~depth_only & is_cnv & ~pesr_size_filter & metrics.pesr_pass) |
               (~is_cnv & metrics.pesr_pass))

    def _join_samples(s):
        return sorted(set(s))

    passes = metrics.loc[passing].groupby('name')['sample'].agg(_join_samples)
    fails = metrics.loc[~passing].groupby('name')['sample'].agg(_join_samples)

    # Hoist into a set: `in` against the ndarray from unique() is a linear
    # scan per record; a set gives O(1) membership tests.
    checked_variants = set(metrics.name.unique())

    for record in vcf:
        # Write records unaltered if they weren't included in the de novo check
        if record.id not in checked_variants:
            fout.write(record)
        # Otherwise set samples appropriately
        else:
            pass_samples = passes.get(record.id, [])
            for sample in pass_samples:
                record.samples[sample]['GT'] = (0, 1)

            fail_samples = fails.get(record.id, [])
            for sample in fail_samples:
                set_null(record, sample)

            # Only report record if any samples made it through de novo check
            if len(svu.get_called_samples(record)) > 0:
                fout.write(record)
def choose_background(self, record, whitelist=None, blacklist=None):
    """Split samples into called and randomly chosen background sets.

    Both groups are restricted to whitelisted, non-blacklisted samples;
    the background set is downsampled to at most `self.n_background`
    samples chosen without replacement.

    Parameters
    ----------
    record : pysam.VariantRecord
    whitelist : list of str, optional
        Overrides `self.whitelist` when provided.
    blacklist : list of str, optional
        Overrides `self.blacklist` when provided.

    Returns
    -------
    called : list of str
    background : list of str
    """
    # Permit override of specified white/blacklists
    if whitelist is None:
        whitelist = self.whitelist
    if blacklist is None:
        blacklist = self.blacklist

    # Select called and background samples
    called = svu.get_called_samples(record)
    background = [s for s in self.samples if s not in called]

    # Single pass replaces the original's separate whitelist/blacklist
    # filters — the combined predicate is equivalent.
    def _admissible(samples):
        return [s for s in samples
                if s in whitelist and s not in blacklist]

    called = _admissible(called)
    background = _admissible(background)

    if len(background) >= self.n_background:
        background = np.random.choice(background, self.n_background,
                                      replace=False).tolist()

    return called, background
def process_metadata(variants, bed=False, batch_list=None):
    """Tabulate per-variant frequency metadata over parents and children.

    Parameters
    ----------
    variants : pysam.VariantFile or file
        VCF records, or (when ``bed=True``) an iterable of BED lines.
    bed : bool, optional
        Treat `variants` as BED lines instead of VCF records.
    batch_list : file, optional
        File of sample IDs, one per line; required when ``bed=True``.

    Returns
    -------
    pd.DataFrame
        Columns: name, svtype, parental_vf, child_vf, inh_rate.
    """
    if bed:
        samples = [s.strip() for s in batch_list.readlines()]
    else:
        samples = list(variants.header.samples)

    # Cohort-wide denominators, fixed before iterating variants
    n_parents = sum(1 for s in samples if _is_parent(s))
    n_children = sum(1 for s in samples if _is_child(s))

    metadata = deque()
    for variant in variants:
        # bed record
        if bed:
            if variant.startswith('#'):
                continue
            data = variant.strip().split()
            name = data[3]
            called = data[4].split(',')
            svtype = data[5]
        # VCF record
        else:
            called = svu.get_called_samples(variant)
            name = variant.id
            svtype = variant.info['SVTYPE']

        # Calculate parental VF
        called_parents = [s for s in called if _is_parent(s)]
        parental_vf = len(called_parents) / n_parents if n_parents > 0 else 0

        called_children = [s for s in called if _is_child(s)]
        child_vf = len(called_children) / n_children if n_children > 0 else 0

        inh_rate = get_inh_rate(called) if child_vf > 0 else 0

        metadata.append([name, svtype, parental_vf, child_vf, inh_rate])

    cols = 'name svtype parental_vf child_vf inh_rate'.split()
    return pd.DataFrame(np.array(metadata), columns=cols)
def samples_overlap(recA, recB, upper_thresh=0.8, lower_thresh=0.5):
    """
    Report if the samples called in two VCF records overlap sufficiently.

    The fraction of each record's samples which are shared with the other
    record is calculated. The record with a greater fraction of shared
    samples must exceed the upper threshold AND the record with a lesser
    fraction of shared samples must exceed the lower threshold. This is
    intended to maximize sensitivity in rare variants with a false negative
    in one breakpoint.

    Parameters
    ----------
    recA : pysam.VariantRecord
    recB : pysam.VariantRecord
    upper_thresh : float, optional
        Minimum sample overlap in record with greater overlap
    lower_thresh : float, optional
        Minimum sample overlap in record with lesser overlap

    Returns
    -------
    samples_overlap : bool
        Samples shared between records meet required thresholds.
    """
    # Get lists of called samples for each record
    samplesA = set(svu.get_called_samples(recA))
    samplesB = set(svu.get_called_samples(recB))

    # A record with no called samples cannot overlap; guarding here also
    # avoids a ZeroDivisionError in the fraction computation below.
    if not samplesA or not samplesB:
        return False

    # Compute fraction of each record's samples which are shared
    shared = samplesA & samplesB
    fracA = len(shared) / len(samplesA)
    fracB = len(shared) / len(samplesB)

    min_frac, max_frac = sorted([fracA, fracB])

    return min_frac >= lower_thresh and max_frac >= upper_thresh
def dn_test(record, parents, config):
    """Run PE and SR evidence tests for each parent of a candidate de novo call.

    For each parent in turn, the parent is tested as the sole "called"
    sample against a background whitelist that excludes all samples called
    in the record and the remaining parents. PE results are appended to
    ``config.petest`` and SR results to ``config.srtest`` as
    tab-separated rows without headers.

    Parameters
    ----------
    record : pysam.VariantRecord
    parents : list of str
        Parent sample IDs to test individually.
    config : object
        Carries open handles/paths: ``discfile``, ``countfile``,
        ``petest``, ``srtest``.
        NOTE(review): exact types of these attributes are not visible
        here — presumably whatever PEBreakpoint/SRBreakpoint expect.
    """
    called = svu.get_called_samples(record)
    for i, parent in enumerate(parents):
        # Background = everyone except called samples and the other parents
        others = parents[:i] + parents[i + 1:]
        blacklist = called + others
        samples = record.samples.keys()
        whitelist = [s for s in samples if s not in blacklist]

        # PE Test: test this parent alone against the background whitelist
        pe = PEBreakpoint.from_vcf(record)
        pe.samples = [parent]
        pe.pe_test(whitelist, config.discfile, n_background=160,
                   window_in=50, window_out=500)
        stats = pe.stats
        stats['name'] = pe.name
        stats['sample'] = parent
        cols = 'name sample log_pval called_median bg_median'.split()
        # Append (no header) so rows from successive parents accumulate
        stats[cols].to_csv(config.petest, sep='\t', index=False,
                           header=False, na_rep='NA')

        # SR Test: same scheme using split-read counts
        sr = SRBreakpoint.from_vcf(record)
        sr.samples = [parent]
        sr.sr_test(whitelist, config.countfile, n_background=160, window=50)
        pvals = sr.best_pvals
        pvals['sample'] = parent
        cols = 'name sample coord pos log_pval called_median bg_median'.split()
        pvals = pvals[cols].fillna(0)
        # Only 'pos' is coerced to int; medians are left as-is
        int_cols = ['pos']  # called_median bg_median'.split()
        for col in int_cols:
            pvals[col] = pvals[col].round().astype(int)
        # Report magnitude of the log p-value
        pvals.log_pval = np.abs(pvals.log_pval)
        pvals.to_csv(config.srtest, sep='\t', index=False, header=False,
                     na_rep='NA')
def from_vcf(cls, record):
    """Construct an instance from a pysam VariantRecord.

    Parameters
    ----------
    record : pysam.VariantRecord
        Record supplying coordinates (chrom/pos and CHR2/stop), the
        variant ID, strand orientation, and called samples.
    """
    samples = svu.get_called_samples(record)
    return cls(record.chrom, record.pos,
               record.info['CHR2'], record.stop,
               record.id, samples, record.info['STRANDS'])
def count_svtypes(vcf):
    """
    Count instances of each SVTYPE in each sample in a VCF.

    Parameters
    ----------
    vcf : pysam.VariantFile

    Returns
    -------
    counts : pd.DataFrame
        Columns: sample, svtype, count
    """
    samples = list(vcf.header.samples)

    # Initialize counts per sample - each dict is keyed on svtype
    count_dict = {sample: defaultdict(int) for sample in samples}

    for record in vcf:
        # Count the SVTYPE if it's present, otherwise tally under NO_SVTYPE.
        # .get() replaces the `'SVTYPE' in record.info.keys()` double lookup,
        # and hoisting it out of the sample loop avoids re-reading INFO per
        # called sample.
        svtype = record.info.get('SVTYPE', 'NO_SVTYPE')
        for sample in svu.get_called_samples(record):
            count_dict[sample][svtype] += 1

    # Convert to dataframe, adding zeros to samples with no instances of a
    # given svtype
    counts = pd.DataFrame.from_dict(count_dict, orient='index')\
                         .fillna(0).astype(int)\
                         .reset_index().rename(columns={'index': 'sample'})

    # Tidy data from "column-per-svtype" format
    counts = pd.melt(counts, id_vars=['sample'],
                     var_name='svtype', value_name='count')

    return counts