Exemple #1
0
def filter_snvs(in_fhand,
                out_fhand,
                filters,
                filtered_fhand=None,
                log_fhand=None,
                reader_kwargs=None):
    '''It filters an input vcf.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    The SNVs are read in packets and every filter in filters is applied, in
    order, to each packet. The SNVs that end up in the PASSED slot of the
    packet are written to out_fhand and, if filtered_fhand is given, the
    ones in the FILTERED_OUT slot are written to it.

    If log_fhand is given the total number of SNVs and the number of SNVs
    that passed each filter are handed to _write_log.

    reader_kwargs are extra keyword arguments for the VCFReader. The given
    dict is not modified.

    The writing stops silently as soon as _safe_write_snv reports a failed
    write.
    '''
    # Work on a copy, the dict given by the caller should not be modified
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    # The input fhand to this function cannot be compressed
    reader_kwargs.update({
        'compressed': False,
        'filename': 'pyvcf_bug_workaround'
    })

    reader = VCFReader(in_fhand, **reader_kwargs)

    # Both writers share a template built only from the header of the input
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)
    # NOTE(review): the count starts at 0.01, not at 0, presumably to avoid a
    # division by zero in _write_log when there are no SNVs -- confirm
    tot_snps = 0.01
    # filter class name -> number of SNVs still in PASSED after that filter
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])
        for filter_ in filters:
            packet = filter_(packet)
            filter_name = filter_.__class__.__name__
            if filter_name not in passed_snps:
                passed_snps[filter_name] = 0
            passed_snps[filter_name] += len(packet[PASSED])

        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break
        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break
        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
    # The filtered out SNVs have to reach their file too
    if filtered_writer:
        filtered_writer.flush()
Exemple #2
0
def run_genotype_filters(in_fhand,
                         out_fhand,
                         gt_filters,
                         plots_dir=None,
                         reader_kwargs=None):
    '''It applies the genotype filters to every snv of the input vcf.

    Every callable in gt_filters is applied, in order, to each snv read from
    in_fhand; each one has to return the snv that is handed to the next one.
    The resulting snv is written to out_fhand.

    reader_kwargs are extra keyword arguments for the VCFReader. The given
    dict is not modified. plots_dir is not used by this code.

    If the output pipe is closed the writing stops silently, any other
    IOError is raised.
    '''
    import errno

    # Work on a copy, the dict given by the caller should not be modified
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}

    reader_kwargs['filename'] = 'pyvcf_bug_workaround'
    reader_kwargs['compressed'] = False
    reader = VCFReader(in_fhand, **reader_kwargs)

    # The writer template is built only from the header of the input
    templa_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=templa_reader)

    for snv in reader.parse_snvs():
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed. The errno is checked besides
            # the message because the message text is platform dependent
            if error.errno == errno.EPIPE or 'Broken pipe' in str(error):
                break
            raise
Exemple #3
0
    def test_cons_recomb(self):
        '''WeirdRecombFilter results with the default and custom settings.'''
        path = os.path.join(TEST_DATA_DIR, 'scaff000025.vcf.gz')

        # default settings
        reader = VCFReader(open(path))
        recomb_filter = WeirdRecombFilter(pop_type='ril_self')
        kept = list(recomb_filter.filter_snvs(reader.parse_snvs()))
        assert len(kept) == 258
        assert recomb_filter.not_fitted_counter['no close region left'] == 10
        plot_fhand = NamedTemporaryFile(suffix='.png')
        recomb_filter.plot_recomb_at_0_dist_hist(plot_fhand)
        for kind, count in (('ok', 245), ('ok_conf_is_None', 13),
                            ('not_ok', 14)):
            assert len(recomb_filter.recomb_rates[kind]) == count

        # fixed maximum recombination at distance 0 and no alpha
        reader = VCFReader(open(path))
        recomb_filter = WeirdRecombFilter(pop_type='ril_self',
                                          max_zero_dist_recomb=0.07,
                                          alpha_recomb_0=None)
        kept = list(recomb_filter.filter_snvs(reader.parse_snvs()))
        assert len(kept) == 266
        assert recomb_filter.not_fitted_counter['no close region left'] == 10
        plot_fhand = NamedTemporaryFile(suffix='.png')
        recomb_filter.plot_recomb_at_0_dist_hist(plot_fhand)
        for kind, count in (('ok', 0), ('ok_conf_is_None', 266),
                            ('not_ok', 6)):
            assert len(recomb_filter.recomb_rates[kind]) == count

        # the log of the last filter
        log_fhand = StringIO()
        recomb_filter.write_log(log_fhand)
        assert 'SNVs processed: 282' in log_fhand.getvalue()
Exemple #4
0
    def test_high_variable_region_filter(self):
        '''HighVariableRegion with several variabilities and window sizes.'''

        def run_filter(max_variability, window_in_bp):
            'It returns the filter and the records it yields for VCF_PATH'
            snvs = VCFReader(open(VCF_PATH),
                             min_calls_for_pop_stats=1).parse_snvs()
            hv_filter = HighVariableRegion(max_variability=max_variability,
                                           window_in_bp=window_in_bp,
                                           ref_fpath=REF_PATH)
            return hv_filter, list(hv_filter(snvs))

        hv_filter, filtered = run_filter(0.02, 101)
        assert hv_filter.name in filtered[0].filters
        assert hv_filter.name not in filtered[3].filters

        hv_filter, filtered = run_filter(0.05, 101)
        assert hv_filter.name not in filtered[0].filters

        # name and description of the filter
        assert hv_filter.name == 'hv0.05'
        expected_desc = 'The region has more than 5 snvs per 101 bases'
        assert expected_desc in hv_filter.description

        for window in (11, 101):
            hv_filter, filtered = run_filter(0.003, window)
            assert hv_filter.name in filtered[0].filters
Exemple #5
0
 def test_get_snpcaller(self):
     '''The reader guesses the snp caller that created each vcf.'''
     first_batch = [('sample.vcf.gz', VARSCAN),
                    ('gatk_sample.vcf.gz', GATK),
                    ('freebayes_sample.vcf.gz', FREEBAYES)]
     # the three files are opened before any reader is created
     fhands = [open(join(TEST_DATA_DIR, fname)) for fname, _ in first_batch]
     for fhand, (_, caller) in zip(fhands, first_batch):
         assert VCFReader(fhand=fhand).snpcaller == caller

     generic_fhand = open(join(TEST_DATA_DIR, 'generic.vcf.gz'))
     assert VCFReader(fhand=generic_fhand).snpcaller == GENERIC
Exemple #6
0
 def test_vcf_writer(self):
     '''The snvs read from a vcf are written to a new file.'''
     in_fhand = open(join(TEST_DATA_DIR, 'vari_filter.vcf'))
     vcf_reader = VCFReader(fhand=in_fhand)
     tmp_fhand = NamedTemporaryFile()
     vcf_writer = VCFWriter(tmp_fhand, vcf_reader)
     for record in vcf_reader.parse_snvs():
         vcf_writer.write_snv(record)
     vcf_writer.flush()
     # the output is checked before the writer is closed
     written = open(tmp_fhand.name).read()
     assert 'CUUC00027_TC01' in written
     vcf_writer.close()
Exemple #7
0
def plot_haplotypes(vcf_fhand, plot_fhand, genotype_mode=REFERENCE,
                    filter_alleles_gt=FILTER_ALLELES_GT):
    '''It plots the alleles of every sample along the snvs of a vcf.

    The vcf is read from vcf_fhand and the figure, with one subplot per
    sample, is written to plot_fhand. The samples are the ones found in the
    calls of the first snv.

    filter_alleles_gt is handed to _get_alleles for every call.
    genotype_mode is not used by this code.

    It raises a ValueError if there are no snvs or no samples to plot.
    '''
    reader = VCFReader(vcf_fhand)

    # collect data
    genotypes = None
    samples = []
    for snv in reader.parse_snvs():
        if genotypes is None:
            # the first snv fixes the samples and their order
            genotypes = {}
            for call in snv.calls:
                sample = call.sample
                genotypes[sample] = []
                samples.append(sample)

        for call in snv.calls:
            alleles = _get_alleles(call, filter_alleles_gt=filter_alleles_gt)
            genotypes[call.sample].append(alleles)

    if not samples:
        # Without this check the code failed later with an obscure error
        raise ValueError('The VCF has no SNVs or no samples to plot')

    # draw
    n_samples = len(samples)
    # Floor division keeps the size an integer also in Python 3
    n_snvs = len(genotypes[samples[0]])
    # the figure size is kept between 8 and 100 wide and up to 100 high
    xsize = min(max(n_snvs // 100, 8), 100)
    ysize = min(n_samples * 2, 100)
    figure_size = (xsize, ysize)

    fig = Figure(figsize=figure_size)

    for index, sample in enumerate(samples):
        # matplotlib numbers the subplots starting at 1, not at 0
        axes = fig.add_subplot(n_samples, 1, index + 1)
        axes.set_title(sample)
        y_data = genotypes[sample]
        x_data = [i + 1 for i in range(len(y_data))]
        x_data, y_data = _flatten_data(x_data, y_data)

        axes.plot(x_data, y_data, marker='o',
                  linestyle='None', markersize=3.0, markeredgewidth=0,
                  markerfacecolor='red')
        # some room is left above and below the points
        ylim = axes.get_ylim()
        ylim = ylim[0] - 0.1, ylim[1] + 0.1
        axes.set_ylim(ylim)
        # Real booleans are used, the 'on' and 'off' strings are deprecated
        axes.tick_params(axis='x', bottom=False, top=False, which='both',
                         labelbottom=False)
        axes.tick_params(axis='y', left=True, right=False, labelleft=False)
        axes.set_ylabel(sample)

    canvas = FigureCanvas(fig)
    canvas.print_figure(plot_fhand, dpi=300)
    plot_fhand.flush()
Exemple #8
0
def filter_snvs(in_fhand, out_fhand, filters, filtered_fhand=None,
                log_fhand=None, reader_kwargs=None):
    '''It filters an input vcf.

    The input fhand has to be uncompressed. The original file could be a
    gzipped file, but in that case it has to be opened with gzip.open before
    sending it to this function.

    The SNVs are read in packets and every filter in filters is applied, in
    order, to each packet. The SNVs that end up in the PASSED slot of the
    packet are written to out_fhand and, if filtered_fhand is given, the
    ones in the FILTERED_OUT slot are written to it.

    If log_fhand is given the total number of SNVs and the number of SNVs
    that passed each filter are handed to _write_log.

    reader_kwargs are extra keyword arguments for the VCFReader. The given
    dict is not modified.

    The writing stops silently as soon as _safe_write_snv reports a failed
    write.
    '''
    # Work on a copy, the dict given by the caller should not be modified
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    # The input fhand to this function cannot be compressed
    reader_kwargs.update({'compressed': False,
                          'filename': 'pyvcf_bug_workaround'})

    reader = VCFReader(in_fhand, **reader_kwargs)

    # Both writers share a template built only from the header of the input
    template_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=template_reader)
    if filtered_fhand:
        filtered_writer = VCFWriter(filtered_fhand,
                                    template_reader=template_reader)
    else:
        filtered_writer = None

    packets = group_in_filter_packets(reader.parse_snvs(),
                                      SNPS_PER_FILTER_PACKET)
    # NOTE(review): the count starts at 0.01, not at 0, presumably to avoid a
    # division by zero in _write_log when there are no SNVs -- confirm
    tot_snps = 0.01
    # filter class name -> number of SNVs still in PASSED after that filter
    passed_snps = OrderedDict()
    broken_pipe = False
    for packet in packets:
        tot_snps += len(packet[PASSED]) + len(packet[FILTERED_OUT])
        for filter_ in filters:
            packet = filter_(packet)
            filter_name = filter_.__class__.__name__
            if filter_name not in passed_snps:
                passed_snps[filter_name] = 0
            passed_snps[filter_name] += len(packet[PASSED])

        for snv in packet[PASSED]:
            if not _safe_write_snv(writer, snv):
                broken_pipe = True
                break
        if filtered_writer:
            for snv in packet[FILTERED_OUT]:
                if not _safe_write_snv(filtered_writer, snv):
                    broken_pipe = True
                    break
        if broken_pipe:
            break

    if log_fhand:
        _write_log(log_fhand, tot_snps, passed_snps)

    writer.flush()
    # The filtered out SNVs have to reach their file too
    if filtered_writer:
        filtered_writer.flush()
Exemple #9
0
    def __init__(self,
                 vcf_fpath,
                 gq_threshold=None,
                 dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None,
                 window_size=WINDOWS_SIZE):
        '''It creates the counters and calculates the statistics of a vcf.

        vcf_fpath: path to the vcf, it is opened by a VCFReader and also by
            a pyvcf reader.
        gq_threshold: stored as 0 when None is given.
        dp_threshold: stored as it is given.
        min_calls_for_pop_stats: handed to the VCFReader.
        remarkable_coverages: depths that get their own counter in
            sample_dp_coincidence, REMARKABLE_DEPTHS is used when None.
        window_size: stored as it is given.

        The statistics are calculated at the end by calling _calculate.
        '''
        # remarkable_depths used to be assigned only when no value was given,
        # so giving a value raised an UnboundLocalError
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(
            open(vcf_fpath), min_calls_for_pop_stats=min_calls_for_pop_stats)

        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold

        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # one counter for the depth 1 and one for every remarkable depth
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter
        # counter name -> sample -> counter, the depth and quality counters
        # are split in homozygotes and heterozygotes
        self._sample_counters = {}

        for counter_name in SAMPLE_COUNTERS:
            if counter_name not in self._sample_counters:
                self._sample_counters[counter_name] = {}
            for sample in self._reader.samples:
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                self._sample_counters[counter_name][sample] = counters

        self._snv_counters = {
            MAFS: IntCounter(),
            MACS: IntCounter(),
            MAFS_DP: IntCounter(),
            SNV_QUALS: IntCounter(),
            HET_IN_SNP: IntCounter(),
            SNV_DENSITY: IntCounter(),
            INBREED_F_IN_SNP: IntCounter(),
            DEPTHS: IntCounter()
        }
        self._calculate()
Exemple #10
0
    def test_close_to_filter(self):
        '''CloseToSnv with several maf thresholds and a snv type.'''
        reader = VCFReader(open(FREEBAYES_VCF_PATH),
                           min_calls_for_pop_stats=1)
        records = list(reader.parse_snvs())

        def run_filter(**kwargs):
            'It applies a 300 nucleotide filter to a copy of the 2nd record'
            record = records[1].copy()
            close_filter = CloseToSnv(distance=300, **kwargs)
            close_filter(record)
            return close_filter, record

        close_filter, record = run_filter(max_maf_depth=None)
        assert close_filter.name in record.filters

        close_filter, record = run_filter(max_maf_depth=0.5)
        assert record.filters is None

        close_filter, record = run_filter(max_maf_depth=0.8)
        assert close_filter.name in record.filters

        # name and description of the filter
        assert close_filter.name == 'cs300_0.80'
        expected_desc = ('The snv is closer than 300 nucleotides to another '
                         'snv, with maf:0.80')
        assert expected_desc in close_filter.description

        close_filter, record = run_filter(max_maf_depth=0.8, snv_type='snp')
        assert close_filter.name in record.filters
Exemple #11
0
    def test_filter_calls(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
20\t17330\t.\tT\tA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3
20\t1110696\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4
20\t1230237\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2
20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3
20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        snp = snps[4]
        assert len(snp.alleles) == 3
        snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',))
        assert len(snp_filtered.alleles) == 2

        snp = snps[1]
        snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',))
        assert len(snp_filtered.calls) == 1

        snp = snps[1]
        snp_filtered = snp.filter_calls_by_sample(samples=('NA00003',),
                                                  reverse=True)
        assert len(snp_filtered.calls) == 2

        try:
            snp_filtered = snp.filter_calls_by_sample(samples=('NA0003',),
                                                      reverse=True)
            self.fail("KeyError Expected")
        except KeyError:
            pass
        assert len(snp_filtered.calls) == 2
Exemple #12
0
    def test_genotype_freq(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT N1 N2 N3
20\t1\t.\tG\tA\t2\tq1\tNS=3\tGT\t0|0\t1|0\t1/1
20\t2\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|0\t0|0\t0/.
20\t3\t.\tT\tA\t3\tq1\tNS=3\tGT\t1|1\t1|1\t./.
20\t4\t.\tT\tA\t3\tq1\tNS=3\tGT\t.\t.\t.
20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|2\t1|1\t1/.
20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|1\t1|0\t1/.
20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t0|0\t0|0\t1/.
20\t5\t.\tT\tA\t3\tq1\tNS=3\tGT\t1|1\t0|0\t1/0
'''
        vcf = StringIO(VCF_HEADER2 + vcf)
        snps = list(VCFReader(vcf, min_calls_for_pop_stats=1).parse_snvs())

        assert snps[0].genotype_counts == {(0, 0): 1, (0, 1): 1, (1, 1): 1}
        assert snps[1].genotype_counts == {(0, 0): 2, (None, 0): 1}
        assert snps[2].genotype_counts == {(1, 1): 2}
        assert snps[3].genotype_counts is None

        self.assertAlmostEqual(snps[0].genotype_freqs[(0, 0)], 0.33333, 4)

        assert snps[4].biallelic_genotype_counts == (1, 0, 1)
        assert snps[5].biallelic_genotype_counts == (0, 2, 0)
        assert snps[6].biallelic_genotype_counts == (2, 0, 0)
        assert snps[7].biallelic_genotype_counts == (1, 1, 1)
        self.assertAlmostEqual(snps[7].biallelic_genotype_freqs[0], 0.33333, 4)
Exemple #13
0
    def test_r_example(self):
        # r examples
        self.assertAlmostEqual(_calculate_r_sqr(HaploCount(10, 10, 10, 10)),
                               0)
        self.assertAlmostEqual(_fisher_exact(HaploCount(10, 10, 10, 10)), 1)
        self.assertAlmostEqual(_calculate_r_sqr(HaploCount(10, 0, 0, 10)), 1)

        self.assertAlmostEqual(_calculate_r_sqr(HaploCount(441, 13, 111, 435)),
                               0.591332576)
        self.assertAlmostEqual(_fisher_exact(HaploCount(6, 6, 2, 6)),
                               0.3728506787)
        self.assertAlmostEqual(_fisher_exact(HaploCount(1, 0, 5, 7)),
                               0.4615385)
        self.assertAlmostEqual(_fisher_exact(HaploCount(5, 23, 1, 20)),
                               0.219157345)

        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t./.\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t3/3\t3/3\t3/3\t2/2\t2/2\t3/3\t3/3\t'''

        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        ld_stats = calculate_ld_stats(snps[0], snps[1])
        self.assertAlmostEqual(ld_stats.fisher, 0.39999999999)
        self.assertAlmostEqual(ld_stats.r_sqr, 0.49999999)
Exemple #14
0
 def _create_reader_from_snv(snv):
     '''It returns a new VCFReader for the file the snv was read from.

     The file is reopened using the name of the fhand of the reader of the
     snv and the min_calls_for_pop_stats of that reader is kept.
     '''
     source = snv.reader
     vcf_path = source.fhand.name
     reader_kwargs = {
         'min_calls_for_pop_stats': source.min_calls_for_pop_stats}
     return VCFReader(open(vcf_path), **reader_kwargs)
Exemple #15
0
    def test_binary(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
'''
        fhand = NamedTemporaryFile()
        fhand.write(VCF_HEADER + vcf)
        fhand.flush()
        out_fhand = NamedTemporaryFile()

        binary = join(VCF_BIN_DIR, 'filter_vcf_by_ld')
        cmd = [binary, '-o', out_fhand.name, fhand.name,
               '--no_bonferroni_correction', '--p_val', '0.03']
        process = Popen(cmd, stderr=PIPE)
        process.communicate()
        assert len(list(VCFReader(open(out_fhand.name)).parse_snvs())) == 3

        log_fhand = NamedTemporaryFile()
        binary = join(VCF_BIN_DIR, 'filter_vcf_by_ld')
        cmd = [binary, '-o', out_fhand.name, fhand.name,
               '--no_bonferroni_correction', '--p_val', '0.03',
               '-l', log_fhand.name]
        process = Popen(cmd, stderr=PIPE)
        process.communicate()
        assert 'filtered' in open(log_fhand.name).read()
Exemple #16
0
    def test_cap_enzyme_filter(self):
        seq_str = '>seq1\nATGATGATGgaaattcATGATGATGTGGGAT\n'
        seq_str += '>seq2\nATGATGATGATGATGATGTGGGAT\n'
        fhand = NamedTemporaryFile()
        fhand.write(seq_str)
        fhand.flush()
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
seq1\t11\trs6054257\tAA\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
seq2\t12\t.\tA\tAA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3
20\t1110696\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4
20\t1230237\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2
20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3
20\t1234567\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())

        filter_ = CapEnzyme(all_enzymes=True, ref_fpath=fhand.name)
        assert filter_.name == 'cet'
        desc = 'SNV is not a CAP detectable by the enzymes: all'
        assert desc in filter_.description

        rec1 = snps[0].copy()
        filter_(rec1)
        assert filter_.name not in rec1.filters

        rec1 = snps[1].copy()
        filter_(rec1)
        assert filter_.name in rec1.filters
Exemple #17
0
    def test_bin(self):
        '''The filter_vcf_by_weird_segregation binary: help and a real run.'''
        binary = join(VCF_BIN_DIR, 'filter_vcf_by_weird_segregation')

        # the help is written to stdout
        help_process = Popen([binary, '-h'], stderr=PIPE, stdout=PIPE)
        help_text = help_process.communicate()[0]
        assert 'usage' in help_text

        # a real run, every sample is given with its own -s option
        samples = ['1_14_1_gbs', '1_17_1_gbs', '1_18_4_gbs', '1_19_4_gbs',
                   '1_26_1_gbs', '1_27_1_gbs', '1_2_2_gbs', '1_35_13_gbs',
                   '1_3_2_gbs', '1_50_1_gbs', '1_59_1_gbs', '1_63_4_gbs',
                   '1_6_2_gbs', '1_70_1_gbs', '1_74_1_gbs', '1_79_1_gbs',
                   '1_7_2_gbs', '1_81_10_gbs', '1_86_1_gbs', '1_8_2_gbs',
                   '1_91_2_gbs', '1_94_4_gbs', '2_107_1_gbs', '2_10_2_gbs',
                   '2_116_1_gbs', '2_11_1_gbs', '2_125_2_gbs', '2_13_1_gbs']
        cmd = [binary, '-n', '2', '-m', '200']
        for sample in samples:
            cmd.extend(['-s', sample])
        cmd.append(os.path.join(TEST_DATA_DIR, 'scaff000025.vcf.gz'))

        run_process = Popen(cmd, stderr=PIPE, stdout=PIPE)
        stdout, stderr = run_process.communicate()
        written = list(VCFReader(StringIO(stdout)).parse_snvs())
        assert len(written) == 273
        assert 'SNVs processed:' in stderr
Exemple #18
0
    def test_het_unknown(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t./.\t1/.\t
'''
        vcf = StringIO(VCF_HEADER + vcf)
        reader = VCFReader(vcf)
        snps = list(reader.parse_snvs())
        snp = snps[0]
        expected = [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [],
                    [1, None]]
        assert [call.int_alleles for call in snps[0].calls] == expected
        assert snp.num_called == 7
        out_fhand = StringIO()
        writer = VCFWriter(out_fhand, reader)
        for snv in snps:
            writer.write_snv(snv)
        assert '1/1\t./.\t1/.' in out_fhand.getvalue()
Exemple #19
0
    def test_mafs(self):
        '''The maf and the depth based maf of snvs and calls.

        The tolerance checks compare the absolute difference. They used to
        compare the signed difference, so any value lower than the expected
        one passed.
        '''
        vcf = open(join(TEST_DATA_DIR, 'freebayes_al_depth.vcf'))
        snps = list(VCFReader(vcf).parse_snvs())
        assert abs(snps[0].maf_depth - 0.5) < 0.001
        assert snps[0].allele_depths == {0: 1, 1: 1}
        assert snps[0].depth == 2
        assert abs(snps[1].maf_depth - 1.0) < 0.001
        assert snps[1].allele_depths == {0: 2, 1: 0}
        assert abs(snps[4].maf_depth - 0.9890) < 0.001
        assert snps[4].allele_depths == {0: 90, 1: 1}
        assert snps[4].depth == 91

        # the depth based maf of every call of the fifth snv
        result = [1, 1, 1, 1, 1, 0.944444]
        for call, res in zip(snps[4].calls, result):
            assert abs(call.maf_depth - res) < 0.001
        assert snps[0].mac

        # the maf is None when min_calls_for_pop_stats is set to 3 for the
        # first snv
        snps[0].min_calls_for_pop_stats = 3
        assert snps[0].maf is None
        snps[3].min_calls_for_pop_stats = 3
        assert abs(snps[3].maf - 0.75) < 0.0001
        snps[4].min_calls_for_pop_stats = 3
        assert abs(snps[4].maf - 1.0) < 0.0001
        assert snps[0].mac == 2

        # varscan
        varscan_fhand = open(join(TEST_DATA_DIR, 'sample.vcf.gz'))
        reader = VCFReader(fhand=varscan_fhand)
        snp = list(reader.parse_snvs())[0]
        snp.min_calls_for_pop_stats = 1
        assert snp.maf_depth is None

        # gatk
        fhand = open(join(TEST_DATA_DIR, 'gatk_sample.vcf.gz'))
        reader = VCFReader(fhand=fhand)
        snp = list(reader.parse_snvs())[0]
        assert 0.7 < snp.maf_depth < 0.72
        assert 0.7 < snp.get_call('hib_amarillo').maf_depth < 0.72

        # freebayes
        fhand = open(join(TEST_DATA_DIR, 'freebayes_sample.vcf.gz'))
        reader = VCFReader(fhand=fhand)
        snp = list(reader.parse_snvs())[0]
        assert 0.99 < snp.maf_depth < 1.01
        assert 0.99 < snp.get_call('pep').maf_depth < 1.01
Exemple #20
0
    def test_rqtl_writer(self):
        '''The snvs of the class vcf are written in the expected format.'''
        vcf_text = unicode(self.VCF_HEADER + self.vcf)
        records = list(VCFReader(StringIO(vcf_text)).parse_snvs())

        out_fhand = StringIO()
        rqtl_writer = RQTLWriter(out_fhand,
                                 phys_to_genet_dist=DEF_PHYS_TO_GENET_DIST)
        for record in records:
            rqtl_writer.write(record)
        assert out_fhand.getvalue() == self.expected
Exemple #21
0
    def test_empy_snv(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t./.
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1'''

        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        call1 = snps[0].record.samples
        call2 = snps[1].record.samples
        counts = _count_biallelic_haplotypes(call1, call2)
        assert counts is None
Exemple #22
0
    def test_check_backwards(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t2403\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = VCFReader(vcf).parse_snvs()
        snvs = filter_snvs_by_ld(snps, p_val=0.03, bonferroni=False)
        assert [s.pos for s in snvs] == [1, 702, 2002, 2402]
Exemple #23
0
    def test_recomb_rate(self):
        '''_calc_recomb_rate for several pairs of snvs and population types.

        A small vcf with 8 samples is checked first and then a pair of snvs
        with many samples.
        '''
        # samples
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t3\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t
20\t6\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t0/1\t0/1\t0/1\t
21\t4\t.\tG\tA\t29\tPASS\tNS=3\tGT\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())

        # two snvs with the same genotypes
        recomb = _calc_recomb_rate(snps[0].record.samples,
                                   snps[1].record.samples,
                                   'ril_self')
        self.assertAlmostEqual(recomb, 0.0, 3)
        # the same pair of snvs gives a different rate for each population
        # type
        recomb = _calc_recomb_rate(snps[0].record.samples,
                                   snps[2].record.samples,
                                   'ril_self')
        self.assertAlmostEqual(recomb, 0.375, 3)
        recomb = _calc_recomb_rate(snps[0].record.samples,
                                   snps[2].record.samples,
                                   'test_cross')
        self.assertAlmostEqual(recomb, 0.5, 3)
        # the fourth snv has only heterozygous or missing genotypes
        recomb = _calc_recomb_rate(snps[0].record.samples,
                                   snps[3].record.samples,
                                   'test_cross')
        assert recomb is None

        # NOTE(review): two pairs of sample names are fused, without a tab
        # between them, in the header line of this vcf
        # ('1_27_1_gbs1_2_2_gbs' and '2_43_1_gbs2_5_1_gbs') -- confirm that
        # it is intended
        vcf = '''#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t1_14_1_gbs\t1_17_1_gbs\t1_18_4_gbs\t1_19_4_gbs\t1_26_1_gbs\t1_27_1_gbs1_2_2_gbs\t1_35_13_gbs\t1_3_2_gbs\t1_50_1_gbs\t1_59_1_gbs\t1_63_4_gbs\t1_6_2_gbs\t1_70_1_gbs\t1_74_1_gbs\t1_79_1_gbs\t1_7_2_gbs\t1_81_10_gbs\t1_86_1_gbs\t1_8_2_gbs\t1_91_2_gbs\t1_94_4_gbs\t2_107_1_gbs\t2_10_2_gbs\t2_116_1_gbs\t2_11_1_gbs\t2_125_2_gbs\t2_13_1_gbs\t2_16_3_gbs\t2_21_1_gbs\t2_22A_1_gbs\t2_24_2_gbs\t2_28_2_gbs\t2_31_2_gbs\t2_33_1_gbs\t2_39_3_gbs\t2_43_1_gbs2_5_1_gbs\t2_64_7_gbs\t2_67_2_gbs\t2_6_4_gbs\t2_84_2_gbs\t2_8_3_gbs\t2_95_2_gbs\t4_100B_4_gbs\t4_108_10_gbs\t4_110_11_gbs\t4_111_6_gbs\t4_115B_2_gbs\t4_11B_3_gbs\t4_123B_2_gbs\t4_127_6_gbs\t4_131_1_gbs\t4_136B_3_gbs\t4_136_10_T1_gbs\t4_138B_2_gbs\t4_26_11_gbs\t4_28_4_gbs\t4_33_2_gbs\t4_35_1_gbs\t4_38_2_gbs\t4_39_2_gbs\t4_41B_2_gbs\t4_42_11_gbs\t4_45_2_gbs\t4_53_2_gbs\t4_5_5_gbs\t4_62_4_gbs\t4_64B_1_gbs\t4_65_5_gbs\t4_66_2_gbs\t4_71_2_gbs\t4_72_1_gbs\t4_77_1_gbs\t4_7B_1_gbs\t4_7_2_gbs\t4_81B_2_gbs\t4_82B_4_gbs\t4_85_1_gbs\t4_95_1_gbs\t4_9_1_gbs\t5_14B_1_gbs\t5_15B_1_gbs\t5_18_1_gbs\t5_22_2_gbs\t5_24_2_gbs\t5_25_2_gbs\t5_32_3_gbs\t5_33B_4_gbs\t5_34B_2_gbs\t5_3_1_gbs\t5_40B_2_gbs\t5_49B_2_T1_gbs\t5_57_1_gbs\t5_58_1_gbs\t5_66_1_gbs\t5_80B_2_gbs\tMU_16_5_gbs\tV_196_2_gbs\t1\t2
s7\t4039693\tS7_4039693\tT\tG\t.\tPASS\tIV0=F\tGT\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t1/1
s7\t4028261\tS7_4028261\tC\tT\t.\tPASS\tIV0=F\tGT\t1/1\t1/1\t./.\t0/0\t1/1\t0/0\t./.\t0/0\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t1/1\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t0/0\t1/1\t0/0\t1/1\t1/1\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t1/1\t0/0\t0/0\t0/0\t0/0\t0/0\t./.\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/1\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t0/0\t1/1\t0/0
'''
        vcf = StringIO(VCF_HEADER + vcf)
        reader = VCFReader(vcf)
        snps = list(reader.parse_snvs())

        recomb = _calc_recomb_rate(snps[0].record.samples,
                                   snps[1].record.samples,
                                   'ril_self')
        self.assertAlmostEqual(recomb, 0.8187, 3)
Exemple #24
0
    def test_id(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT N1
20\t14370\tid1\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51
20\t14371\tid2;id3\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51
20\t14372\t.\tG\tA\t29\tPASS\tH2\tGT:GQ:DP:HQ\t0|0:48:1:51,51'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        assert snps[0].ids == ['id1']
        assert snps[1].ids == ['id2', 'id3']
        assert not snps[2].ids
        assert snps[1].get_or_create_id() == 'id2'
        assert snps[2].get_or_create_id(prefix='snp_') == 'snp_20_14372'
Exemple #25
0
 def test_allele_depths(self):
     '''The reference and allele depths of the calls of the first snv.'''
     fhand = open(join(TEST_DATA_DIR, 'freebayes_al_depth.vcf'))
     first_snv = list(VCFReader(fhand).parse_snvs())[0]
     # None for the calls without depths, otherwise (ref depth, alt depth)
     expected = [None, None, (1, 0), None, None, (0, 1)]
     for call, depths in zip(first_snv.calls, expected):
         if depths is not None:
             ref_depth, alt_depth = depths
             assert call.ref_depth == ref_depth
             assert call.allele_depths[1] == alt_depth
             continue
         assert call.ref_depth is None
         assert not call.allele_depths
Exemple #26
0
    def test_no_allele_depths(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1'''

        vcf = StringIO(VCF_HEADER2 + vcf)
        snps = list(VCFReader(vcf, min_calls_for_pop_stats=4).parse_snvs())
        filter_ = LowEvidenceAlleleFilter(0.99)
        try:
            snps = [filter_(snp) for snp in snps]
            self.fail('RuntimeError expected')
        except RuntimeError:
            pass
Exemple #27
0
    def test_hetegorigot_percent(self):
        '''The info written by HeterozigoteInSamples with several settings.'''
        reader = VCFReader(open(FREEBAYES4_VCF_PATH),
                           min_calls_for_pop_stats=1)
        records = list(reader.parse_snvs())

        def info_of_copy(het_filter, index):
            'It runs the filter on a copy of a record and returns its info'
            record = records[index].copy()
            het_filter(record)
            return record.infos[het_filter.info_id]

        def check_info_at(het_filter, chrom, pos, expected):
            '''It checks the info of the first record found in a position.

            Nothing is checked when no record is in that position.
            '''
            for record in records:
                if record.pos == pos and record.chrom == chrom:
                    record = record.copy()
                    het_filter(record)
                    assert record.infos[het_filter.info_id] == expected
                    break

        # default settings
        het_filter = HeterozigoteInSamples(filter_id=1)
        assert info_of_copy(het_filter, 0) == 'True'

        het_filter = HeterozigoteInSamples(filter_id=1,
                                           gq_threshold=30,
                                           min_num_called=3)
        assert info_of_copy(het_filter, 1) == 'None'

        het_filter = HeterozigoteInSamples(filter_id=1,
                                           gq_threshold=50,
                                           min_num_called=8)
        assert info_of_copy(het_filter, 0) == 'None'

        # with a minimum percentage of heterozygotes
        het_filter = HeterozigoteInSamples(filter_id=1,
                                           gq_threshold=30,
                                           min_num_called=3,
                                           min_percent_het_gt=30)
        # this run is only required not to fail
        first_copy = records[0].copy()
        het_filter(first_copy)
        check_info_at(het_filter, 'Pepper.v.1.55.chr01', 272668159, 'False')

        # taking into account only some samples
        het_filter = HeterozigoteInSamples(
            filter_id=1,
            gq_threshold=30,
            min_num_called=3,
            min_percent_het_gt=30,
            samples=['sample05_gbs', 'sample06_gbs', 'sample07_gbs'])
        check_info_at(het_filter, 'Pepper.v.1.55.chr10', 228123401, 'None')
Exemple #28
0
def run_genotype_filters(in_fhand, out_fhand, gt_filters, plots_dir=None, reader_kwargs=None):
    '''It applies the genotype filters to every snv and it writes the result.

    in_fhand: file handle with the input VCF. The reader is always created
        with compressed=False, so the content has to be uncompressed.
    out_fhand: file handle where the resulting VCF is written.
    gt_filters: callables that are applied in order to every snv. Each one
        takes a snv and returns the snv that is given to the next one.
    plots_dir: it is accepted, but it is not used by this function.
    reader_kwargs: extra keyword arguments for the VCFReader. The given
        dict is not modified.

    If the output pipe is closed by the consumer the writing just stops,
    any other IOError is raised.
    '''
    import errno

    # A copy is used to avoid modifying the dict owned by the caller
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    reader_kwargs["filename"] = "pyvcf_bug_workaround"
    reader_kwargs["compressed"] = False
    reader = VCFReader(in_fhand, **reader_kwargs)

    templa_reader = VCFReader(StringIO(reader.header))
    writer = VCFWriter(out_fhand, template_reader=templa_reader)

    for snv in reader.parse_snvs():
        for mapper in gt_filters:
            snv = mapper(snv)
        try:
            writer.write_snv(snv)
        except IOError as error:
            # The pipe could be already closed. The errno is checked because
            # the text of the message depends on the platform and the locale
            if error.errno == errno.EPIPE or "Broken pipe" in str(error):
                break
            raise
Exemple #29
0
    def test_het_filter(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
'''
        in_fhand = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(in_fhand).parse_snvs())
        exp = [[0, 0], [1, 0], [1, 1]]
        assert [call.int_alleles for call in snps[0].calls] == exp
        res = [
            call.int_alleles
            for call in snps[0].remove_gt_from_het_calls().calls
        ]
        assert res == [[0, 0], [], [1, 1]]
Exemple #30
0
    def test_cache(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6 7 8
20\t2\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t703\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t2003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t3003\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
20\t3403\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t0/0\t0/0\t0/0\t1/1\t1/1\t1/1\t1/1\t
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = VCFReader(vcf).parse_snvs()
        snvs = filter_snvs_by_ld(snps, p_val=0.001, bonferroni=False,
                                 snv_win=3)
        assert not list(snvs)
Exemple #31
0
    def test_snv_read_pos_distrib(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
reference1\t187\trs6054257\tAA\tA\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
reference1\t210\t.\tA\tAA\t3\tq10\tNS=3;DP=11;AF=0.017\tGT:GQ:DP:HQ\t0|0:49:3:58,50\t0|1:3:5:65,3\t0/0:41:3
reference1\t215\trs6040355\tA\tG,T\t67\tPASS\tNS=2;DP=10;AF=0.333,0.667;AA=T;DB\tGT:GQ:DP:HQ\t1|2:21:6:23,27\t2|1:2:0:18,2\t2/2:35:4
reference1\t230\t.\tT\t.\t47\tPASS\tNS=3;DP=13;AA=T\tGT:GQ:DP:HQ\t0|0:54:7:56,60\t0|0:48:4:51,51\t0/0:61:2
reference2\t350\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t0/1:35:4\t0/2:17:2\t1/1:40:3
reference2\t400\tmicrosat1\tGTC\tG,GTCT\t50\tPASS\tNS=3;DP=9;AA=G\tGT:GQ:DP\t./.:35:4\t0/2:17:2\t1/1:40:3
'''

        snvs = VCFReader(StringIO(vcf)).parse_snvs()
        bam_fpath = join(TEST_DATA_DIR, 'seqs.bam')
        sam = pysam.AlignmentFile(bam_fpath)
        stats = calc_snv_read_pos_stats(sam, snvs)
        assert 'group1+454' in stats['5_read_pos_counts'].keys()
        assert '5_read_pos_boxplot' in stats
        assert '3_read_pos_boxplot' in stats
        assert repr(
            stats['5_read_pos_counts']
        ) == """{'group1+454': IntCounter({24: 9, 1: 9, 44: 9, 29: 9}), 'group2+454': IntCounter({11: 9, 61: 6, 59: 3})}"""
        assert repr(
            stats['3_read_pos_counts']
        ) == """{'group1+454': IntCounter({73: 9, 50: 9, 45: 9, 30: 9}), 'group2+454': IntCounter({14: 6, 64: 3, 65: 3, 62: 3, 15: 3})}"""

        snvs = VCFReader(StringIO(vcf)).parse_snvs()
        bam_fpath = join(TEST_DATA_DIR, 'seqs.bam')
        sam = pysam.AlignmentFile(bam_fpath)
        stats = calc_snv_read_pos_stats2(sam, snvs)
        assert 'group1+454' in stats['5_read_pos_counts'].keys()
        assert '5_read_pos_boxplot' in stats
        assert '3_read_pos_boxplot' in stats
        assert repr(
            stats['5_read_pos_counts']
        ) == """{'group1+454': IntCounter({24: 9, 1: 9, 44: 9, 29: 9}), 'group2+454': IntCounter({11: 9, 61: 6, 59: 3})}"""
        assert repr(
            stats['3_read_pos_counts']
        ) == """{'group1+454': IntCounter({73: 9, 50: 9, 45: 9, 30: 9}), 'group2+454': IntCounter({14: 6, 64: 3, 65: 3, 62: 3, 15: 3})}"""
        fhand = NamedTemporaryFile(suffix='.png')
        draw_read_pos_stats(stats, fhand)
Exemple #32
0
    def test_no_geno_no_alle_freq(self):
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1 2 3 4 5 6
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t./.\t./.\t./.\t./.\t./.\t./.
20\t14\t.\tG\tA\t29\tPASS\tNS=3\tGT\t0/0\t1/1\t1/1\t0/0\t0/0\t0/1'''

        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        filter_ = LowEvidenceAlleleFilter(0.99)
        snps = [filter_(snp) for snp in snps]
        expected = [False] * 12
        res = [call.called for snp in snps for call in snp.calls]
        assert filter_.log == {'not_enough_individuals': 12, 'tot': 12}
        assert expected == res
Exemple #33
0
    def test_complexity_filter(self):
        '''The first two snvs are not marked as low complexity.'''
        snvs = list(VCFReader(open(VCF_PATH)).parse_snvs())
        annotator = LowComplexityRegionAnnotator(ref_fpath=REF_PATH)

        # The first snv is annotated in a copy, the second one in place
        for snv in (snvs[0].copy(), snvs[1]):
            annotator(snv)
            assert not (annotator.name in snv.filters)

        hist_fhand = NamedTemporaryFile(suffix='.png')
        annotator.draw_hist(hist_fhand)
Exemple #34
0
    def test_amino_change_filter(self):
        seq_ref = """>SEUC00016_TC01
    CACGCTAAACAACGATCATTGTCATCGGTACCGATTGTTACAAGTTGTGTGCAGTGTCGT
    GCTATTTGTGTGTACATTCCTTCTAAGATGTCGTCAACAAAGTGGTTGGTGTGTGCGCTA
    GTGGTGGTGTGCGTGAGCGTAAGGCAAGCAACATCTGCGCCGGCGCCGCAGGAACAAGAA
    TACCCGCCTATGCCCTACGAGTACAAATATGACGTTGAAGATCAAGAGCTTGAAGAGAAA
    GCTCTCTACTTCGGAGCCAACGAAGCAGGAGATGCCCAGGGCAAGGTCATCGGAGGATAC
    CGAGTTCTCCTCCCCGATGGTCGTCTTATGACCGTCGAGTACAGTGTGGAGGGAGAAAGC
    GGTTTCGTTCCCAAAATCACCTTCGAAGACAACGCCAGCCCCTTCGGCAAAGGAAAGTAG
    ACCTTATAACGACGCCTACAAGACTGGTACCGCGATCAATTGATACTAGTTCAATTTGAT
    TTCTGAATTCTATGCCGTAAAACATTTTCTTTTATTAATTATACCGATTTCGATAAATAG
    ACATCTTTACCTACTTAACGAATTTCTCATAGGATTCAGAAGTCGAAACCGAAAAAAGTT
    ACTTCAGTTTTCATTAGATTGTAAATGTGTGTAAATTATTATTATTATTATATCAGGGAT
    CCTTAAGTTGATATTAGTGGTGATATAAACGATATTTATGAACGACAATCAGGTATCGTC
    ACTGGCTTGAGTAATGTTAGAAAAAATATAATTTTACCGAAAGCATTAGTAACTTTTTTC
    ACGATTATAATCTCCCATACATACTGTATACTTACGTTACGTATAATAATTTTGATTGTC
    TTCATAGTGTACTCTATAATATATGTAGGTGTAGGCAAAACTCATTCGCCAATAAGATAA
    TATGTACAGTCAGCGATTTCTAAGATAAATTTGTACCGCAAATATCGAGTTACCGATACT
    GTGATCAATTAGAACG"""
        orf_seq = '''>SEUC00016_TC01_orf_seq start=89 end=421 strand=forward
    ATGTCGTCAACAAAGTGGTTGGTGTGTGCGCTAGTGGTGGTGTGCGTGAGCGTAAGGCAAGCAACATCTGCGC
    CGGCGCCGCAGGAACAAGAATACCCGCCTATGCCCTACGAGTACAAATATGACGTTGAAGATCAAGAGCTTGAA
    GAGAAAGCTCTCTACTTCGGAGCCAACGAAGCAGGAGATGCCCAGGGCAAGGTCATCGGAGGATACCGAGTTCT
    CCTCCCCGATGGTCGTCTTATGACCGTCGAGTACAGTGTGGAGGGAGAAAGCGGTTTCGTTCCCAAAATCACCT
    TCGAAGACAACGCCAGCCCCTTCGGCAAAGGAAAGTAG'''
        ref_fhand = NamedTemporaryFile()
        ref_fhand.write(seq_ref)
        ref_fhand.flush()

        orf_fhand = NamedTemporaryFile()
        orf_fhand.write(orf_seq)
        orf_fhand.flush()
        vcf = '''#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
SEUC00016_TC01\t112\trs6054257\tT\tC\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.
'''
        vcf = StringIO(VCF_HEADER + vcf)
        snps = list(VCFReader(vcf).parse_snvs())
        f = AminoChangeAnnotator(ref_fpath=ref_fhand.name,
                                 orf_seq_fpath=orf_fhand.name)

        snv = snps[0].copy()
        f(snv)
        assert f.name in snv.filters
        assert snv.infos['AAC'] == 'C->R'

        f = AminoSeverityChangeAnnotator(ref_fpath=ref_fhand.name,
                                         orf_seq_fpath=orf_fhand.name)
        record = snps[0].copy()
        f(record)
        assert f.name in record.filters
Exemple #35
0
    def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None, window_size=WINDOWS_SIZE):
        '''It creates the counters and it fills them by calling _calculate.

        vcf_fpath: path to the VCF file. It is opened twice, once by the
            VCFReader and once by the pyvcf reader.
        gq_threshold: it is stored as _gq_threshold. None means 0.
        dp_threshold: it is stored as is.
        min_calls_for_pop_stats: it is passed as is to the VCFReader.
        remarkable_coverages: depths that get their own counter in
            sample_dp_coincidence. If None REMARKABLE_DEPTHS is used.
        window_size: it is stored as is.
        '''
        # remarkable_depths used to be defined only for the default case, so
        # any depth list given by the caller ended up in an UnboundLocalError
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(open(vcf_fpath),
                               min_calls_for_pop_stats=min_calls_for_pop_stats)

        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold

        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # One counter for depth 1 plus one for every remarkable depth
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter: {counter_name: {sample: counter or counters}}
        self._sample_counters = {}

        for counter_name in SAMPLE_COUNTERS:
            if counter_name not in self._sample_counters:
                self._sample_counters[counter_name] = {}
            for sample in self._reader.samples:
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    # These ones are split by kind of genotype
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                self._sample_counters[counter_name][sample] = counters

        self._snv_counters = {MAFS: IntCounter(),
                              MACS: IntCounter(),
                              MAFS_DP: IntCounter(),
                              SNV_QUALS: IntCounter(),
                              HET_IN_SNP: IntCounter(),
                              SNV_DENSITY: IntCounter(),
                              INBREED_F_IN_SNP: IntCounter(),
                              DEPTHS: IntCounter()}
        self._calculate()
Exemple #36
0
 def test_header(self):
     '''The header of sample.vcf.gz has the expected content and size.'''
     vcf_fpath = join(TEST_DATA_DIR, 'sample.vcf.gz')
     header = VCFReader(open(vcf_fpath)).header
     for expected_text in ('##fileformat=VCFv4.1', '#CHROM'):
         assert expected_text in header
     header_lines = header.split('\n')
     assert len(header_lines) == 24
Exemple #37
0
    def test_sliding_window(self):
        '''The snvs are grouped in windows with and without a reference.'''
        vcf_fpath = join(TEST_DATA_DIR, 'sample_to_window.vcf.gz')

        def new_reader():
            return VCFReader(fhand=open(vcf_fpath))

        def snv_positions(windows, num_windows):
            # The windows are taken by index, so a missing window raises
            # an IndexError as it would do when checking them one by one
            return [[snp.pos for snp in windows[idx]['snps']]
                    for idx in range(num_windows)]

        # snps in this vcf [9, 19, 29, 0, 11, 20]
        reader = new_reader()
        windows = list(reader.sliding_windows(size=10, min_num_snps=1))
        assert snv_positions(windows, 5) == [[9], [19], [29], [0], [11]]

        # the same reader is used again for a wider window
        windows = list(reader.sliding_windows(size=20, min_num_snps=1))
        assert snv_positions(windows, 2) == [[9, 19], [0, 11]]

        ref = '>CUUC00007_TC01\nCTGATGCTGATCGTGATCGAGTCGTAGTCTAGTCGATGTCGACG\n'
        ref += '>CUUC00029_TC01\nCTGATGCTGATCGTGATCGAGTCGTAGTCTAGTCGATGTCGAA\n'
        reader = new_reader()
        windows = list(reader.sliding_windows(size=10, min_num_snps=1,
                                              ref_fhand=StringIO(ref)))
        expected = [[9], [19], [29], [0], [11], [20]]
        assert snv_positions(windows, 6) == expected

        # with fasta
        reader = new_reader()
        windows = list(reader.sliding_windows(size=20, min_num_snps=1,
                                              ref_fhand=StringIO(ref)))
        assert snv_positions(windows, 4) == [[9, 19], [29], [0, 11], [20]]

        # we skip windows that have no snps
        reader = new_reader()
        windows = list(reader.sliding_windows(size=5, min_num_snps=1,
                                              ref_fhand=StringIO(ref)))
        expected = [[9], [19], [29], [0], [11], [20]]
        assert snv_positions(windows, 6) == expected

        # we skip no window
        reader = new_reader()
        windows = list(reader.sliding_windows(size=5, min_num_snps=0,
                                              ref_fhand=StringIO(ref)))
        assert snv_positions(windows, 4) == [[], [9], [], [19]]

        # overlapping windows, every snv shows up in two of them
        reader = new_reader()
        windows = list(reader.sliding_windows(size=10, min_num_snps=0,
                                              ref_fhand=StringIO(ref),
                                              step=5))
        assert snv_positions(windows, 4) == [[9], [9], [19], [19]]
Exemple #38
0
class VcfStats(object):
    '''It gathers statistics about the snvs and the genotypes of a VCF file.

    All the counters are filled when the object is created by going once
    through all the snvs of the file.
    '''

    def __init__(self, vcf_fpath, gq_threshold=None, dp_threshold=100,
                 min_calls_for_pop_stats=DEF_MIN_CALLS_FOR_POP_STATS,
                 remarkable_coverages=None, window_size=WINDOWS_SIZE):
        '''It creates the counters and it fills them by parsing the VCF.

        vcf_fpath: path to the VCF file. It is opened twice, once to go
            through all its snvs and once to fetch the snvs by region.
        gq_threshold: the genotypes with a quality lower than this one are
            not added to the per sample counters. None means 0.
        dp_threshold: only the genotypes with a depth lower than this value
            are added to the depth vs genotype quality boxplots.
        min_calls_for_pop_stats: it is passed as is to the VCFReader.
        remarkable_coverages: depths for which the number of samples that
            reach them is counted in every snv. If None REMARKABLE_DEPTHS
            is used.
        window_size: distance to each side of a snv used to count the snvs
            that surround it.
        '''
        # remarkable_depths used to be defined only for the default case, so
        # any depth list given by the caller ended up in an UnboundLocalError
        if remarkable_coverages is None:
            remarkable_depths = REMARKABLE_DEPTHS
        else:
            remarkable_depths = remarkable_coverages
        self.remarkable_depths = remarkable_depths

        self._reader = VCFReader(open(vcf_fpath),
                               min_calls_for_pop_stats=min_calls_for_pop_stats)

        self._random_reader = pyvcfReader(filename=vcf_fpath)

        self.window_size = window_size
        self._gq_threshold = 0 if gq_threshold is None else gq_threshold

        self.dp_threshold = dp_threshold
        self._gt_qual_depth_counter = {HOM: IntBoxplot(), HET: IntBoxplot()}
        self._ac2d = _AlleleCounts2D()

        # One counter for depth 1 plus one for every remarkable depth
        self.sample_dp_coincidence = {1: IntCounter()}
        for cov in remarkable_depths:
            self.sample_dp_coincidence[cov] = IntCounter()

        self.called_snvs = 0
        self.called_gts = IntCounter()

        # sample_counter: {counter_name: {sample: counter or counters}}
        self._sample_counters = {}

        for counter_name in SAMPLE_COUNTERS:
            if counter_name not in self._sample_counters:
                self._sample_counters[counter_name] = {}
            for sample in self._reader.samples:
                if counter_name in (GT_DEPTHS, GT_QUALS):
                    # These ones are split by kind of genotype
                    counters = {HOM: IntCounter(), HET: IntCounter()}
                else:
                    counters = IntCounter()
                self._sample_counters[counter_name][sample] = counters

        self._snv_counters = {MAFS: IntCounter(),
                              MACS: IntCounter(),
                              MAFS_DP: IntCounter(),
                              SNV_QUALS: IntCounter(),
                              HET_IN_SNP: IntCounter(),
                              SNV_DENSITY: IntCounter(),
                              INBREED_F_IN_SNP: IntCounter(),
                              DEPTHS: IntCounter()}
        self._calculate()

    def _add_depth(self, snp):
        'It counts the depth of the snv, a missing depth is counted as 0'
        depth = snp.depth
        if depth is None:
            depth = 0
        self._snv_counters[DEPTHS][depth] += 1

    def _add_maf_and_mac(self, snp):
        '''It counts the maf (as a rounded percentage) and the mac.

        A maf or a mac that is None or 0 is not counted.
        '''
        maf = snp.maf
        if maf:
            maf = int(round(maf * 100))
            self._snv_counters[MAFS][maf] += 1
        mac = snp.mac
        if mac:
            self._snv_counters[MACS][mac] += 1

    def _add_maf_dp(self, snp):
        '''It counts the depth based maf of the snv and of each of its calls.

        The mafs are stored as rounded percentages, the ones that are None
        are skipped.
        '''
        maf_dp = snp.maf_depth
        if maf_dp is not None:
            self._snv_counters[MAFS_DP][int(round(maf_dp * 100))] += 1
        for call in snp.calls:
            maf_dp = call.maf_depth
            if maf_dp is None:
                continue
            sample = call.sample
            maf_depth = int(round(maf_dp * 100))
            self._sample_counters[MAFS_DP][sample][maf_depth] += 1

    def _add_snv_qual(self, snp):
        'It counts the rounded quality of the snv, if it has one'
        snv_qual = snp.qual
        if snv_qual is not None:
            self._snv_counters[SNV_QUALS][int(round(snv_qual))] += 1

    def _add_snv_density(self, snp):
        '''It counts the snvs found around the given one.

        The region goes from window_size before the snv to window_size
        after it and the snv itself is not counted.
        '''
        windows_size = self.window_size
        pos = snp.pos
        # The start is only clamped at the beginning of the chromosome. It
        # used to be compared with the window size, which moved the start to
        # 0 for any snv closer than two window sizes to the beginning
        start = max(0, pos - windows_size)
        end = pos + windows_size
        chrom = snp.chrom
        # -1 to remove the snv we are looking at from the count
        num_snvs = len(list(self._random_reader.fetch(chrom, start, end))) - 1

        self._snv_counters[SNV_DENSITY][num_snvs] += 1

    def _add_snv_het_obs_fraction(self, snp):
        '''It counts the observed het. and the inbreeding coef. of the snv.

        Both are stored as rounded percentages. If the observed
        heterozygosity is None nothing is counted.
        '''
        obs_het = snp.obs_het
        if obs_het is None:
            return
        self._snv_counters[HET_IN_SNP][int(round(obs_het * 100))] += 1

        inbreed_coef = snp.inbreed_coef
        if inbreed_coef is None:
            return
        inbreed_coef = int(round(inbreed_coef * 100))
        self._snv_counters[INBREED_F_IN_SNP][inbreed_coef] += 1

    @staticmethod
    def _num_samples_higher_equal_dp(depth, snp):
        '''It returns how many called genotypes reach the given depth.

        The calls that have no depth are not counted.
        '''
        n_samples = 0
        for call in snp.calls:
            if not call.called:
                continue
            call_depth = call.depth
            # None is never >= than a number in python 2 and it can not be
            # compared in python 3, so it is explicitly skipped
            if call_depth is not None and call_depth >= depth:
                n_samples += 1
        return n_samples

    def _calculate(self):
        'It goes through all the snvs in the VCF filling all the counters'
        for snp in self._reader.parse_snvs():
            self._add_maf_dp(snp)
            self._add_maf_and_mac(snp)
            self._add_snv_qual(snp)
            self._add_snv_density(snp)
            self._add_snv_het_obs_fraction(snp)
            self._add_depth(snp)

            # items is used because viewitems only exists in python 2
            for depth, counter in self.sample_dp_coincidence.items():
                n_samples = self._num_samples_higher_equal_dp(depth, snp)
                counter[n_samples] += 1

            n_gt_called = 0
            for call in snp.calls:
                if not call.called:
                    continue
                n_gt_called += 1
                sample_name = call.sample
                ref_depth = call.ref_depth
                acs = call.alt_sum_depths
                gt_type = call.gt_type

                gt_broud_type = HET if call.is_het else HOM

                depth = call.depth
                gt_qual = call.gt_qual
                if depth is not None and depth < self.dp_threshold:
                    self._gt_qual_depth_counter[gt_broud_type].append(depth,
                                                                      gt_qual)
                # CHECK THIS. This is an special case where the only info we
                # have is the genotype
                if gt_qual is None:
                    self._sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    if depth is not None:
                        self._sample_counters[GT_DEPTHS][sample_name][gt_broud_type][depth] += 1
                elif gt_qual >= self._gq_threshold:
                    self._sample_counters[GT_TYPES][sample_name][gt_type] += 1
                    self._sample_counters[GT_QUALS][sample_name][gt_broud_type][gt_qual] += 1
                    self._sample_counters[GT_DEPTHS][sample_name][gt_broud_type][depth] += 1
                self._ac2d.add(rc=ref_depth, acs=acs, gt=call.int_alleles,
                               gq=gt_qual)
            self.called_gts[n_gt_called] += 1
            self.called_snvs += 1

    def _get_sample_counter(self, kind, sample=None, gt_broud_type=None):
        '''It returns the counter of the given kind.

        If a sample is given its counter is returned, otherwise the counters
        of all the samples are added in a new counter. gt_broud_type (HOM or
        HET) selects one of the two counters for the kinds that are split by
        genotype type.
        '''
        counters = self._sample_counters[kind]
        if sample is not None:
            if gt_broud_type is None:
                return counters[sample]
            else:
                return counters[sample][gt_broud_type]
        all_counters = IntCounter()
        for sample_counter in counters.values():
            if gt_broud_type is None:
                all_counters += sample_counter
            else:
                all_counters += sample_counter[gt_broud_type]
        return all_counters

    def macs(self):
        'It returns the counter of the minor allele counts of the snvs'
        return self._snv_counters[MACS]

    def mafs(self):
        'It returns the counter of the mafs of the snvs'
        return self._snv_counters[MAFS]

    def mafs_dp(self, sample=None):
        '''It returns the counter of the depth based mafs.

        The counter is the one of the snvs or, if a sample is given, the one
        of the calls of that sample.
        '''
        if sample is None:
            return self._snv_counters[MAFS_DP]
        return self._get_sample_counter(MAFS_DP, sample)

    def gt_depths(self, gt_broud_type, sample=None):
        'It returns the genotype depths counter for HOM or HET genotypes'
        return self._get_sample_counter(GT_DEPTHS, sample,
                                        gt_broud_type=gt_broud_type)

    def gt_quals(self, gt_broud_type, sample=None):
        'It returns the genotype quality counter for HOM or HET genotypes'
        return self._get_sample_counter(GT_QUALS, sample,
                                        gt_broud_type=gt_broud_type)

    def heterozigosity_for_sample(self, sample):
        '''It returns the HET genotypes divided by all the counted genotypes.

        It returns 0 when the sample has no counted genotype.
        '''
        sample_gt_types = self._get_sample_counter(GT_TYPES, sample)

        het_gt = sample_gt_types[HET]
        all_gts = sample_gt_types.count
        try:
            heterozigosity = het_gt / all_gts
        except ZeroDivisionError:
            heterozigosity = 0
        return heterozigosity

    def gt_types(self, sample=None):
        'It returns the genotype type counter of one or of all the samples'
        return self._get_sample_counter(GT_TYPES, sample)

    @property
    def samples(self):
        'The samples as given by the VCF reader'
        return self._reader.samples

    @property
    def min_calls_for_pop_stats(self):
        'The min_calls_for_pop_stats of the VCF reader'
        return self._reader.min_calls_for_pop_stats

    @property
    def snv_density(self):
        'The counter of the number of snvs found around each snv'
        return self._snv_counters[SNV_DENSITY]

    @property
    def snv_quals(self):
        'The counter of the snv qualities'
        return self._snv_counters[SNV_QUALS]

    @property
    def het_by_snp(self):
        'The counter of the observed heterozygosities of the snvs'
        return self._snv_counters[HET_IN_SNP]

    @property
    def inbreeding_by_snp(self):
        'The counter of the inbreeding coefficients of the snvs'
        return self._snv_counters[INBREED_F_IN_SNP]

    @property
    def allelecount2d(self):
        'The object that accumulates the allele counts of every genotype'
        return self._ac2d

    @property
    def gt_depths_by_gt_and_qual(self):
        'The depth vs genotype quality boxplots for HOM and HET genotypes'
        return self._gt_qual_depth_counter

    @property
    def depths(self):
        'The counter of the snv depths'
        return self._snv_counters[DEPTHS]
Exemple #39
0
    def filter_vcf(self, vcf_fpath, min_samples=DEF_MIN_CALLS_FOR_POP_STATS):
        '''It yields the snvs that segregate like the snvs located nearby.

        The genotype counts of every snv are compared with the ones of the
        snvs found in two windows, one at each side of the snv. A snv is
        yielded when the fraction of failed comparisons is lower than
        max_failed_freq. The snvs without enough snvs to compare with or
        without biallelic genotype counts are skipped.

        vcf_fpath: path to the VCF file. It is opened twice, once to go
            through all the snvs and once to fetch the snvs in the windows.
        min_samples: it is given to the VCFReader as min_calls_for_pop_stats.

        tot_snps, passed_snps and _failed_freqs are updated while the snvs
        are consumed. If plot_dir is set a debug plot is written per snv.
        '''
        reader = VCFReader(open(vcf_fpath),
                           min_calls_for_pop_stats=min_samples)
        snvs = reader.parse_snvs()
        random_reader = VCFReader(open(vcf_fpath))

        for snv_1 in snvs:
            self.tot_snps += 1
            loc = snv_1.pos

            if self.plot_dir:
                chrom = str(snv_1.chrom)
                fname = chrom + '_' + str(loc) + '.png'
                chrom_dir = pjoin(self.plot_dir, chrom)
                if not exists(chrom_dir):
                    mkdir(chrom_dir)
                plot_fhand = open(pjoin(chrom_dir, fname), 'w')
                debug_plot_info = []
            else:
                plot_fhand = None

            # The try makes sure that the plot file is closed, even when
            # the snv is skipped by one of the continues
            try:
                win_1_start = loc - (self.win_width / 2)
                if win_1_start < 0:
                    win_1_start = 0
                win_1_end = loc - (self.win_mask_width / 2)
                if win_1_end < 0:
                    win_1_end = 0
                if win_1_end != 0:
                    snvs_win_1 = random_reader.fetch_snvs(
                        snv_1.chrom,
                        start=int(win_1_start),
                        end=int(win_1_end))
                else:
                    snvs_win_1 = []

                # The limits are cast to int as in the first window, the
                # divisions could have returned floats
                win_2_start = loc + (self.win_mask_width / 2)
                win_2_end = loc + (self.win_width / 2)
                snvs_win_2 = random_reader.fetch_snvs(snv_1.chrom,
                                                      start=int(win_2_start),
                                                      end=int(win_2_end))
                snvs_in_win = list(chain(snvs_win_1, snvs_win_2))
                if len(snvs_in_win) > self.num_snvs_check:
                    snvs_in_win = random.sample(snvs_in_win,
                                                self.num_snvs_check)
                if len(snvs_in_win) < self.min_num_snvs_check_in_win:
                    # Not enough snps to check
                    continue

                # The snv to yield is the one with all the samples
                orig_snp = snv_1
                if self.samples is not None:
                    snv_1 = snv_1.filter_calls_by_sample(self.samples)

                exp_cnts = snv_1.biallelic_genotype_counts

                if exp_cnts is None:
                    continue

                test_values = []
                for snv_2 in snvs_in_win:
                    if self.samples is not None:
                        snv_2 = snv_2.filter_calls_by_sample(self.samples)
                    obs_cnts = snv_2.biallelic_genotype_counts
                    if obs_cnts is None:
                        continue
                    test_values.append(_fisher_extact_rxc(obs_cnts,
                                                          exp_cnts))

                    if plot_fhand:
                        debug_plot_info.append({'pos': snv_2.pos,
                                                'AA': obs_cnts[0],
                                                'Aa': obs_cnts[1],
                                                'aa': obs_cnts[2],
                                                'close_snp': True})

                # This check has to be done before dividing by the number of
                # tests, otherwise a snv with no testable neighbours raised
                # a ZeroDivisionError instead of being skipped
                if len(test_values) < self.min_num_snvs_check_in_win:
                    # few snps can be tested for segregation
                    continue

                tot_checked = len(test_values)
                if tot_checked > 0:
                    # The alpha is split between all the tests
                    alpha2 = self.alpha / tot_checked
                    results = []
                    for idx, val in enumerate(test_values):
                        result = False if val is None else val > alpha2
                        results.append(result)

                        if plot_fhand:
                            debug_plot_info[idx]['result'] = result

                    failed_freq = results.count(False) / tot_checked
                    passed = self.max_failed_freq > failed_freq
                    self._failed_freqs.append(failed_freq)
                else:
                    passed = False

                if plot_fhand:
                    debug_plot_info.append({'pos': snv_1.pos,
                                            'AA': exp_cnts[0],
                                            'Aa': exp_cnts[1],
                                            'aa': exp_cnts[2],
                                            'result': passed,
                                            'close_snp': False})
                    self._plot_segregation_debug(debug_plot_info, plot_fhand)
            finally:
                if plot_fhand is not None:
                    plot_fhand.close()

            if passed:
                self.passed_snps += 1
                yield orig_snp