def test_mafs(self):
    """Check depth-based MAF, call-based MAF and MAC on several VCF files."""

    def first_snv_of(fname):
        # Parse the given test-data file and hand back its first SNV.
        parser = VCFReader(fhand=open(join(TEST_DATA_DIR, fname)))
        return list(parser.parse_snvs())[0]

    # NOTE(review): every ``value - expected < tolerance`` check below is
    # one-sided (there is no abs()), so it also passes when the value is
    # well below the expected one.  Kept as-is to preserve the test.
    al_depth_fhand = open(join(TEST_DATA_DIR, 'freebayes_al_depth.vcf'))
    snvs = list(VCFReader(al_depth_fhand).parse_snvs())

    snv0 = snvs[0]
    assert snv0.maf_depth - 0.5 < 0.001
    assert snv0.allele_depths == {0: 1, 1: 1}
    assert snv0.depth == 2

    snv1 = snvs[1]
    assert snv1.maf_depth - 1.0 < 0.001
    assert snv1.allele_depths == {0: 2, 1: 0}

    snv4 = snvs[4]
    assert snv4.maf_depth - 0.9890 < 0.001
    assert snv4.allele_depths == {0: 90, 1: 1}
    assert snv4.depth == 91

    # Per-call depth-based MAFs for the fifth SNV.
    expected_call_mafs = [1, 1, 1, 1, 1, 0.944444]
    for call, expected_maf in zip(snv4.calls, expected_call_mafs):
        assert call.maf_depth - expected_maf < 0.001

    # Call-based statistics; the minimum number of calls is raised to 3
    # on each SNV right before its MAF is read.
    assert snv0.mac
    snv0.min_calls_for_pop_stats = 3
    assert snv0.maf is None

    snv3 = snvs[3]
    snv3.min_calls_for_pop_stats = 3
    assert snv3.maf - 0.75 < 0.0001

    snv4.min_calls_for_pop_stats = 3
    assert snv4.maf - 1.0 < 0.0001
    assert snv0.mac == 2

    # varscan
    varscan_snv = first_snv_of('sample.vcf.gz')
    varscan_snv.min_calls_for_pop_stats = 1
    assert varscan_snv.maf_depth is None

    # gatk
    gatk_snv = first_snv_of('gatk_sample.vcf.gz')
    assert 0.7 < gatk_snv.maf_depth < 0.72
    assert 0.7 < gatk_snv.get_call('hib_amarillo').maf_depth < 0.72

    # freebayes
    freebayes_snv = first_snv_of('freebayes_sample.vcf.gz')
    assert 0.99 < freebayes_snv.maf_depth < 1.01
    assert 0.99 < freebayes_snv.get_call('pep').maf_depth < 1.01
def test_vcf_writer(self):
    """Write every SNV of a VCF through VCFWriter and check the output.

    The input file and the handle used to read the result back are closed
    deterministically, and the writer is closed even if the assertion
    fails.
    """
    out_fhand = NamedTemporaryFile()
    with open(join(TEST_DATA_DIR, 'vari_filter.vcf')) as varscan:
        reader = VCFReader(fhand=varscan)
        writer = VCFWriter(out_fhand, reader)
        try:
            for snv in reader.parse_snvs():
                writer.write_snv(snv)
            # Flush before reading the temporary file back by name.
            writer.flush()
            with open(out_fhand.name) as written_fhand:
                assert 'CUUC00027_TC01' in written_fhand.read()
        finally:
            writer.close()
def plot_haplotypes(vcf_fhand, plot_fhand, genotype_mode=REFERENCE,
                    filter_alleles_gt=FILTER_ALLELES_GT):
    """Plot the alleles of every sample along the SNVs of a VCF.

    One subplot per sample is stacked vertically; the x axis is the 1-based
    SNV index in file order and the y values are whatever ``_get_alleles``
    returns for each call (flattened with ``_flatten_data``).

    Parameters:
        vcf_fhand: file handle of the VCF, passed to ``VCFReader``.
        plot_fhand: file handle the figure is printed to (and flushed).
        genotype_mode: accepted for interface compatibility; this function
            does not use it.
        filter_alleles_gt: forwarded to ``_get_alleles``.

    Raises:
        ValueError: if the VCF yields no SNVs or no sample calls, so there
            is nothing to draw.
    """
    reader = VCFReader(vcf_fhand)

    # collect data: sample name -> list of alleles, one item per SNV
    genotypes = None
    samples = []
    for snv in reader.parse_snvs():
        if genotypes is None:
            # The sample order is fixed by the calls of the first SNV.
            genotypes = {}
            for call in snv.calls:
                genotypes[call.sample] = []
                samples.append(call.sample)

        for call in snv.calls:
            alleles = _get_alleles(call, filter_alleles_gt=filter_alleles_gt)
            genotypes[call.sample].append(alleles)

    if not samples:
        raise ValueError('No SNVs or samples found in the VCF, nothing to plot')

    # draw
    n_samples = len(samples)
    # Every sample got one entry per SNV, so any sample gives the SNV count.
    n_snvs = len(genotypes[samples[0]])
    # Figure size grows with the data but is clamped to sane limits.
    xsize = max(8, min(100, n_snvs / 100))
    ysize = min(100, n_samples * 2)
    fig = Figure(figsize=(xsize, ysize))

    # Subplot positions are 1-based in matplotlib.
    for position, sample in enumerate(samples, 1):
        axes = fig.add_subplot(n_samples, 1, position)
        axes.set_title(sample)
        y_data = genotypes[sample]
        x_data = list(range(1, len(y_data) + 1))
        x_data, y_data = _flatten_data(x_data, y_data)

        axes.plot(x_data, y_data, marker='o',
                  linestyle='None', markersize=3.0, markeredgewidth=0,
                  markerfacecolor='red')
        # Pad the y limits so the markers on the extremes are not clipped.
        ylim = axes.get_ylim()
        axes.set_ylim((ylim[0] - 0.1, ylim[1] + 0.1))
        axes.tick_params(axis='x', bottom=False, top=False, which='both',
                         labelbottom=False)
        axes.tick_params(axis='y', left=True, right=False, labelleft=False)
        axes.set_ylabel(sample)

    canvas = FigureCanvasAgg(fig)
    canvas.print_figure(plot_fhand, dpi=300)
    plot_fhand.flush()
def filter_snvs(in_fhand, out_fhand, filters, filtered_fhand=None,
                log_fhand=None, reader_kwargs=None):
    '''It filters an input vcf.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    Parameters:
        in_fhand: uncompressed VCF file handle to read the SNVs from.
        out_fhand: file handle that receives the SNVs that pass the filters.
        filters: callables applied in order to every packet of SNVs; each
            one takes a packet and returns a packet.
        filtered_fhand: optional file handle that receives the SNVs that
            were filtered out.
        log_fhand: optional file handle; when given, the totals are written
            to it with ``_write_log``.
        reader_kwargs: optional dict of extra keyword arguments for
            ``VCFReader``. It is copied, the caller's dict is not modified.
    '''
    # Work on a copy so the caller's dict is never mutated.
    reader_kwargs = {} if reader_kwargs is None else dict(reader_kwargs)

    # The input fhand to this function cannot be compressed
    reader_kwargs.update({'compressed': False,
                          'filename': 'pyvcf_bug_workaround'})
    reader = VCFReader(in_fhand, **reader_kwargs)

    # The writers are built from a reader that only holds the header.
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)
    # NOTE(review): the total starts at 0.01 rather than 0, presumably so
    # _write_log never divides by zero -- confirm against _write_log.
    tot_snps = 0.01
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])
        for filter_ in filters:
            packet = filter_(packet)
            filter_name = filter_.__class__.__name__
            # Accumulate how many SNVs are still passing after each filter.
            passed_snps[filter_name] = (passed_snps.get(filter_name, 0) +
                                        len(packet[PASSED]))

        # A false result from _safe_write_snv is treated as a broken pipe:
        # stop writing and leave the packet loop.
        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break
        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break
        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
def test_het_unknown(self):
    """A half-unknown genotype (``1/.``) is parsed and written back."""
    vcf_body = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t./.\t1/.\t
'''
    reader = VCFReader(StringIO(VCF_HEADER + vcf_body))
    snvs = list(reader.parse_snvs())
    first_snv = snvs[0]

    # Four hom-ref calls, two hom-alt, one uncalled and one half-called.
    expected_alleles = [[0, 0], [0, 0], [0, 0], [0, 0],
                        [1, 1], [1, 1],
                        [],
                        [1, None]]
    parsed_alleles = [call.int_alleles for call in first_snv.calls]
    assert parsed_alleles == expected_alleles
    assert first_snv.num_called == 7

    # Round trip: the unknown alleles have to be written as dots.
    out_fhand = StringIO()
    writer = VCFWriter(out_fhand, reader)
    for snv in snvs:
        writer.write_snv(snv)
    written_vcf = out_fhand.getvalue()
    assert '1/1\t./.\t1/.' in written_vcf
def test_recomb_rate(self):
    """Check _calc_recomb_rate on a small and on a many-sample VCF."""

    def recomb_between(snvs, index_1, index_2, pop_type):
        # Recombination rate between two SNVs, given by their indexes.
        return _calc_recomb_rate(snvs[index_1].record.samples,
                                 snvs[index_2].record.samples, pop_type)

    # samples
    small_vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t
20\t6\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t0/1\t0/1\t0/1\t
21\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t
'''
    small_snvs = list(VCFReader(StringIO(VCF_HEADER + small_vcf)).parse_snvs())

    # (second SNV index, population type, expected rate)
    expectations = [(1, 'ril_self', 0.0),
                    (2, 'ril_self', 0.375),
                    (2, 'test_cross', 0.5)]
    for other_index, pop_type, expected_rate in expectations:
        rate = recomb_between(small_snvs, 0, other_index, pop_type)
        self.assertAlmostEqual(rate, expected_rate, 3)

    # The fourth SNV has mostly missing or het calls: no rate is returned.
    assert recomb_between(small_snvs, 0, 3, 'test_cross') is None

    big_vcf = '''#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t1_14_1_gbs\t1_17_1_gbs\t1_18_4_gbs\t1_19_4_gbs\t1_26_1_gbs\t1_27_1_gbs1_2_2_gbs\t1_35_13_gbs\t1_3_2_gbs\t1_50_1_gbs\t1_59_1_gbs\t1_63_4_gbs\t1_6_2_gbs\t1_70_1_gbs\t1_74_1_gbs\t1_79_1_gbs\t1_7_2_gbs\t1_81_10_gbs\t1_86_1_gbs\t1_8_2_gbs\t1_91_2_gbs\t1_94_4_gbs\t2_107_1_gbs\t2_10_2_gbs\t2_116_1_gbs\t2_11_1_gbs\t2_125_2_gbs\t2_13_1_gbs\t2_16_3_gbs\t2_21_1_gbs\t2_22A_1_gbs\t2_24_2_gbs\t2_28_2_gbs\t2_31_2_gbs\t2_33_1_gbs\t2_39_3_gbs\t2_43_1_gbs2_5_1_gbs\t2_64_7_gbs\t2_67_2_gbs\t2_6_4_gbs\t2_84_2_gbs\t2_8_3_gbs\t2_95_2_gbs\t4_100B_4_gbs\t4_108_10_gbs\t4_110_11_gbs\t4_111_6_gbs\t4_115B_2_gbs\t4_11B_3_gbs\t4_123B_2_gbs\t4_127_6_gbs\t4_131_1_gbs\t4_136B_3_gbs\t4_136_10_T1_gbs\t4_138B_2_gbs\t4_26_11_gbs\t4_28_4_gbs\t4_33_2_gbs\t4_35_1_gbs\t4_38_2_gbs\t4_39_2_gbs\t4_41B_2_gbs\t4_42_11_gbs\t4_45_2_gbs\t4_53_2_gbs\t4_5_5_gbs\t4_62_4_gbs\t4_64B_1_gbs\t4_65_5_gbs\t4_66_2_gbs\t4_71_2_gbs\t4_72_1_gbs\t4_77_1_gbs\t4_7B_1_gbs\t4_7_2_gbs\t4_81B_2_gbs\t4_82B_4_gbs\t4_85_1_gbs\t4_95_1_gbs\t4_9_1_gbs\t5_14B_1_gbs\t5_15B_1_gbs\t5_18_1_gbs\t5_22_2_gbs\t5_24_2_gbs\t5_25_2_gbs\t5_32_3_gbs\t5_33B_4_gbs\t5_34B_2_gbs\t5_3_1_gbs\t5_40B_2_gbs\t5_49B_2_T1_gbs\t5_57_1_gbs\t5_58_1_gbs\t5_66_1_gbs\t5_80B_2_gbs\tMU_16_5_gbs\tV_196_2_gbs\t1\t2
s7\t4039693\tS7_4039693\tT\tG\t.\tPASS\tIV0=F\tGT\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1
s7\t4028261\tS7_4028261\tC\tT\t.\tPASS\tIV0=F\tGT\t1/1\t1/1\t./.\t0/0\t1/1\t0/0\t./.\t0/0\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0
'''
    big_reader = VCFReader(StringIO(VCF_HEADER + big_vcf))
    big_snvs = list(big_reader.parse_snvs())
    big_rate = recomb_between(big_snvs, 0, 1, 'ril_self')
    self.assertAlmostEqual(big_rate, 0.8187, 3)
def run_genotype_filters(in_fhand, out_fhand, gt_filters, reader_kwargs=None):
    """Apply the genotype filters to every SNV and write the result.

    Parameters:
        in_fhand: uncompressed VCF file handle to read the SNVs from.
        out_fhand: file handle that receives the processed SNVs.
        gt_filters: callables applied in order to each SNV; each one takes
            an SNV and returns the SNV to pass on.
        reader_kwargs: optional dict of extra keyword arguments for
            ``VCFReader``. It is copied, the caller's dict is not modified.

    Raises:
        IOError: any write error other than a broken pipe. A broken pipe
            just stops the writing silently.
    """
    import errno

    # Work on a copy so the caller's dict is never mutated.
    reader_kwargs = {} if reader_kwargs is None else dict(reader_kwargs)
    reader_kwargs['filename'] = 'pyvcf_bug_workaround'
    reader_kwargs['compressed'] = False
    reader = VCFReader(in_fhand, **reader_kwargs)

    # The writer is built from a reader that only holds the header.
    templa_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=templa_reader)

    for snv in reader.parse_snvs():
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed.  The errno check does not
            # depend on the wording of the message; the text check is kept
            # for errors that carry no errno.
            pipe_closed = (getattr(error, 'errno', None) == errno.EPIPE or
                           'Broken pipe' in str(error))
            if pipe_closed:
                break
            raise
def filter_vcf(self, vcf_fpath, min_samples=DEF_MIN_CALLS_FOR_POP_STATS):
    """Yield the SNVs whose genotype counts agree with their neighbours.

    For every SNV, the SNVs located in two windows around it (the central
    ``win_mask_width`` is excluded) are fetched. The biallelic genotype
    counts of each neighbour are compared with those of the SNV using
    ``_fisher_extact_rxc``. A comparison fails when the returned value is
    None or not greater than ``self.alpha``. The SNV is yielded when the
    fraction of failed comparisons is lower than ``self.max_failed_freq``.

    Side effects: updates ``self.tot_snps``, ``self.passed_snps`` and
    appends to ``self._failed_freqs``.

    Parameters:
        vcf_fpath: path to the VCF; it is opened twice, once to iterate
            and once for the window fetches.
        min_samples: passed to ``VCFReader`` as min_calls_for_pop_stats.
    """
    # Both handles are closed when the generator finishes or is closed.
    with open(vcf_fpath) as seq_fhand, open(vcf_fpath) as random_fhand:
        reader = VCFReader(seq_fhand, min_calls_for_pop_stats=min_samples)
        random_reader = VCFReader(random_fhand)

        half_win = self.win_width / 2
        half_mask = self.win_mask_width / 2

        for snv_1 in reader.parse_snvs():
            self.tot_snps += 1
            loc = snv_1.pos

            # Left window, clamped at the start of the chromosome.
            win_1_start = max(loc - half_win, 0)
            win_1_end = max(loc - half_mask, 0)
            if win_1_end != 0:
                snvs_win_1 = random_reader.fetch_snvs(snv_1.chrom,
                                                      start=int(win_1_start),
                                                      end=int(win_1_end))
            else:
                snvs_win_1 = []

            # Right window; the bounds are cast to int like the left ones
            # because the half widths can be floats.
            win_2_start = int(loc + half_mask)
            win_2_end = int(loc + half_win)
            snvs_win_2 = random_reader.fetch_snvs(snv_1.chrom,
                                                  start=win_2_start,
                                                  end=win_2_end)

            snvs_in_win = list(snvs_win_1) + list(snvs_win_2)
            if len(snvs_in_win) > self.num_snvs_check:
                snvs_in_win = random.sample(snvs_in_win, self.num_snvs_check)
            if len(snvs_in_win) < self.min_num_snvs_check_in_win:
                # Not enough snps to check
                continue

            exp_cnts = snv_1.biallelic_genotype_counts
            if exp_cnts is None:
                continue

            tot_checked = 0
            n_failed = 0
            for snv_2 in snvs_in_win:
                obs_cnts = snv_2.biallelic_genotype_counts
                if obs_cnts is None:
                    continue
                value = _fisher_extact_rxc(obs_cnts, exp_cnts)
                tot_checked += 1
                if value is None or not value > self.alpha:
                    n_failed += 1

            if tot_checked < self.min_num_snvs_check_in_win:
                # few snps can be tested for segregation
                continue

            if tot_checked > 0:
                failed_freq = n_failed / tot_checked
                passed = self.max_failed_freq > failed_freq
                self._failed_freqs.append(failed_freq)
            else:
                passed = False

            if passed:
                self.passed_snps += 1
                yield snv_1
class VcfStats(object):
    """Statistics of the SNVs and genotype calls found in a VCF file.

    The whole file is read by the constructor; afterwards the counters are
    available through the methods and properties of the instance.
    """

    def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None, window_size=WINDOWS_SIZE):
        """Read the VCF and fill in every counter.

        Parameters:
            vcf_fpath: path to the VCF file. It is opened for sequential
                parsing and also handed to pyvcfReader for the window
                fetches used by the SNV density.
            gq_threshold: minimum genotype quality for a call to be
                counted; None means 0.
            dp_threshold: only calls with a depth below this value feed
                the depth/quality boxplots.
            min_calls_for_pop_stats: passed to VCFReader.
            remarkable_coverages: depths for which the number of samples
                with at least that depth is counted per SNV; None means
                REMARKABLE_DEPTHS.
            window_size: half width of the window used for SNV density.
        """
        # The original left remarkable_depths unbound when the caller
        # supplied its own coverages.
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(open(vcf_fpath),
                                 min_calls_for_pop_stats=min_calls_for_pop_stats)
        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold
        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # Depth 1 is always tracked, plus the remarkable depths.
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter: counter name -> sample -> counter(s)
        self._sample_counters = {}
        for counter_name in SAMPLE_COUNTERS:
            if counter_name not in self._sample_counters:
                self._sample_counters[counter_name] = {}
            for sample in self._reader.samples:
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    # Depths and qualities are split by HOM/HET.
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                self._sample_counters[counter_name][sample] = counters

        self._snv_counters = {MAFS: IntCounter(),
                              MACS: IntCounter(),
                              MAFS_DP: IntCounter(),
                              SNV_QUALS: IntCounter(),
                              HET_IN_SNP: IntCounter(),
                              SNV_DENSITY: IntCounter(),
                              INBREED_F_IN_SNP: IntCounter(),
                              DEPTHS: IntCounter()}
        self._calculate()

    def _add_depth(self, snp):
        """Count the depth of the SNV; a missing depth is counted as 0."""
        depth = snp.depth
        if depth is None:
            depth = 0
        self._snv_counters[DEPTHS][depth] += 1

    def _add_maf_and_mac(self, snp):
        """Count the MAF (as a rounded percentage) and the MAC of the SNV.

        Falsy values (None, but also 0) are not counted.
        """
        maf = snp.maf
        if maf:
            maf = int(round(maf * 100))
            self._snv_counters[MAFS][maf] += 1
        mac = snp.mac
        if mac:
            self._snv_counters[MACS][mac] += 1

    def _add_maf_dp(self, snp):
        """Count the depth-based MAF of the SNV and of each of its calls."""
        maf_dp = snp.maf_depth
        if maf_dp is not None:
            self._snv_counters[MAFS_DP][int(round(maf_dp * 100))] += 1
        for call in snp.calls:
            maf_dp = call.maf_depth
            if maf_dp is None:
                continue
            sample = call.sample
            maf_depth = int(round(maf_dp * 100))
            self._sample_counters[MAFS_DP][sample][maf_depth] += 1

    def _add_snv_qual(self, snp):
        """Count the rounded quality of the SNV, when it has one."""
        snv_qual = snp.qual
        if snv_qual is not None:
            self._snv_counters[SNV_QUALS][int(round(snv_qual))] += 1

    def _add_snv_density(self, snp):
        """Count how many other records are fetched around the SNV."""
        windows_size = self.window_size
        pos = snp.pos
        # NOTE(review): the start falls back to 0 whenever
        # pos <= 2 * window_size, not only when pos - window_size would be
        # negative.  It looks unintended; kept to preserve the results.
        start = pos - windows_size if pos - windows_size > windows_size else 0
        end = pos + windows_size
        chrom = snp.chrom
        # One is subtracted, presumably to leave out the SNV itself.
        num_snvs = len(list(self._random_reader.fetch(chrom, start, end))) - 1

        self._snv_counters[SNV_DENSITY][num_snvs] += 1

    def _add_snv_het_obs_fraction(self, snp):
        """Count observed heterozygosity and inbreeding coefficient.

        Both are stored as rounded percentages; nothing else is counted
        once one of them is None.
        """
        obs_het = snp.obs_het
        if obs_het is None:
            return
        self._snv_counters[HET_IN_SNP][int(round(obs_het * 100))] += 1
        inbreed_coef = snp.inbreed_coef
        if inbreed_coef is None:
            return
        inbreed_coef = int(round(inbreed_coef * 100))
        self._snv_counters[INBREED_F_IN_SNP][inbreed_coef] += 1

    @staticmethod
    def _num_samples_higher_equal_dp(depth, snp):
        """Return how many called samples have at least the given depth."""
        n_samples = 0
        for call in snp.calls:
            if not call.called:
                continue
            if call.depth >= depth:
                n_samples += 1
        return n_samples

    def _calculate(self):
        """Parse every SNV of the file and update all the counters."""
        for snp in self._reader.parse_snvs():
            self._add_maf_dp(snp)
            self._add_maf_and_mac(snp)
            self._add_snv_qual(snp)
            self._add_snv_density(snp)
            self._add_snv_het_obs_fraction(snp)
            self._add_depth(snp)

            for depth, counter in self.sample_dp_coincidence.items():
                n_samples = self._num_samples_higher_equal_dp(depth, snp)
                counter[n_samples] += 1

            n_gt_called = 0
            for call in snp.calls:
                if not call.called:
                    continue
                n_gt_called += 1
                sample_name = call.sample
                ref_depth = call.ref_depth
                acs = call.alt_sum_depths
                gt_type = call.gt_type
                gt_broud_type = HET if call.is_het else HOM

                depth = call.depth
                gt_qual = call.gt_qual
                if depth is not None and depth < self.dp_threshold:
                    self._gt_qual_depth_counter[gt_broud_type].append(depth,
                                                                      gt_qual)
                sample_counters = self._sample_counters
                # CHECK THIS. This is an special case where the only info we
                # have is the genotype
                if gt_qual is None:
                    sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    if depth is not None:
                        sample_counters[GT_DEPTHS][sample_name][gt_broud_type][depth] += 1
                elif gt_qual >= self._gq_threshold:
                    sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    sample_counters[GT_QUALS][sample_name][gt_broud_type][gt_qual] += 1
                    sample_counters[GT_DEPTHS][sample_name][gt_broud_type][depth] += 1
                    # NOTE(review): assumed to belong to this branch, where
                    # the quality is known -- confirm the nesting.
                    self._ac2d.add(rc=ref_depth, acs=acs,
                                   gt=call.int_alleles, gq=gt_qual)
            self.called_gts[n_gt_called] += 1
            self.called_snvs += 1

    def _get_sample_counter(self, kind, sample=None, gt_broud_type=None):
        """Return the counter of the given kind.

        With a sample, the counter of that sample is returned (restricted
        to gt_broud_type when given). Without a sample, the counters of
        all the samples are added up into a new IntCounter.
        """
        counters = self._sample_counters[kind]
        if sample is not None:
            if gt_broud_type is None:
                return counters[sample]
            return counters[sample][gt_broud_type]

        all_counters = IntCounter()
        for sample_counter in counters.values():
            if gt_broud_type is None:
                all_counters += sample_counter
            else:
                all_counters += sample_counter[gt_broud_type]
        return all_counters

    def macs(self):
        """Return the counter of SNV minor allele counts."""
        return self._snv_counters[MACS]

    def mafs(self):
        """Return the counter of SNV MAFs (rounded percentages)."""
        return self._snv_counters[MAFS]

    def mafs_dp(self, sample=None):
        """Return the depth-based MAF counter, per SNV or for a sample."""
        if sample is None:
            return self._snv_counters[MAFS_DP]
        return self._get_sample_counter(MAFS_DP, sample)

    def gt_depths(self, gt_broud_type, sample=None):
        """Return the genotype depth counter for HOM or HET calls."""
        return self._get_sample_counter(GT_DEPTHS, sample,
                                        gt_broud_type=gt_broud_type)

    def gt_quals(self, gt_broud_type, sample=None):
        """Return the genotype quality counter for HOM or HET calls."""
        return self._get_sample_counter(GT_QUALS, sample,
                                        gt_broud_type=gt_broud_type)

    def heterozigosity_for_sample(self, sample):
        """Return the fraction of HET genotypes of the sample (0 if none)."""
        sample_gt_types = self._get_sample_counter(GT_TYPES, sample)

        het_gt = sample_gt_types[HET]
        all_gts = sample_gt_types.count
        try:
            heterozigosity = het_gt / all_gts
        except ZeroDivisionError:
            heterozigosity = 0
        return heterozigosity

    def gt_types(self, sample=None):
        """Return the genotype type counter, for a sample or for all."""
        return self._get_sample_counter(GT_TYPES, sample)

    @property
    def samples(self):
        """Samples reported by the VCF reader."""
        return self._reader.samples

    @property
    def min_calls_for_pop_stats(self):
        """Minimum number of calls used by the reader for pop. stats."""
        return self._reader.min_calls_for_pop_stats

    @property
    def snv_density(self):
        """Counter of the number of neighbouring records per SNV."""
        return self._snv_counters[SNV_DENSITY]

    @property
    def snv_quals(self):
        """Counter of the rounded SNV qualities."""
        return self._snv_counters[SNV_QUALS]

    @property
    def het_by_snp(self):
        """Counter of the observed heterozygosity per SNV (percentages)."""
        return self._snv_counters[HET_IN_SNP]

    @property
    def inbreeding_by_snp(self):
        """Counter of the inbreeding coefficient per SNV (percentages)."""
        return self._snv_counters[INBREED_F_IN_SNP]

    @property
    def allelecount2d(self):
        """The _AlleleCounts2D accumulator filled in while parsing."""
        return self._ac2d

    @property
    def gt_depths_by_gt_and_qual(self):
        """Depth/quality boxplots of the calls, keyed by HOM and HET."""
        return self._gt_qual_depth_counter

    @property
    def depths(self):
        """Counter of the SNV depths."""
        return self._snv_counters[DEPTHS]