コード例 #1
0
    def test_num_private_alleles(self):
        """Exercise calc_number_of_private_alleles on four genotype layouts.

        Every scenario uses five samples split into the populations
        ``{1: [1, 2], 2: [3, 4, 5]}``; the comparison against the expected
        per-population values is delegated to ``self._check_function``.
        """

        def check(min_num_genotypes, genotypes, expected):
            # A fresh fixture is built per scenario so nothing is shared.
            stat_funct = partial(calc_number_of_private_alleles,
                                 min_num_genotypes=min_num_genotypes)
            varis = VariationsArrays()
            varis[GT_FIELD] = numpy.array(genotypes)
            varis.samples = [1, 2, 3, 4, 5]
            pops = {1: [1, 2], 2: [3, 4, 5]}
            self._check_function(stat_funct, varis, pops, expected)

        # Fifth sample is -1 in every variant; the last variant is all -1.
        check(0,
              [[[0], [0], [0], [0], [-1]],
               [[0], [0], [1], [1], [-1]],
               [[0], [2], [0], [1], [-1]],
               [[-1], [-1], [-1], [-1], [-1]]],
              {1: [0, 1, 1, 0], 2: [0, 1, 1, 0]})

        # No missing alleles
        check(0,
              [[[0], [0], [0], [0], [1]],
               [[0], [0], [1], [1], [1]],
               [[0], [2], [0], [1], [1]],
               [[1], [1], [0], [0], [2]]],
              {1: [0, 1, 1, 1], 2: [1, 1, 1, 2]})

        # all missing
        check(0,
              [[[0], [0], [0], [-1], [-1]],
               [[0], [0], [1], [-1], [-1]],
               [[0], [2], [-1], [-1], [-1]],
               [[-1], [-1], [-1], [-1], [-1]]],
              {1: [0, 1, 2, 0], 2: [0, 1, 0, 0]})

        # min_num_genotypes
        check(2,
              [[[0], [0], [0], [0], [1]],
               [[0], [0], [1], [1], [1]],
               [[0], [2], [0], [1], [1]],
               [[1], [-1], [0], [0], [2]]],
              {1: [0, 1, 1, 0], 2: [1, 1, 1, 0]})
コード例 #2
0
    def test_empty_pop(self):
        """Dest distance when the first population lacks calls.

        Samples 1-5 form the first population and 6-11 the second.  In the
        first scenario only one variant has no calls for the first
        population; in the second scenario every variant does.
        """
        no_call = (-1, -1)

        def dest_distances(genotypes):
            snps = VariationsArrays()
            snps['/calls/GT'] = numpy.array(genotypes)
            snps.samples = list(range(1, 12))
            pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
            return calc_pop_distance(snps,
                                     populations=pops,
                                     method='dest',
                                     min_num_genotypes=0)

        # Variant in which the first five samples carry no call at all.
        pop1_uncalled = [no_call] * 5 + [(3, 2), (3, 4), (2, 2), (2, 4),
                                         (4, 4), no_call]
        fully_called_a = [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2),
                          (3, 4), (2, 2), (2, 4), (4, 4), no_call]
        fully_called_b = [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2),
                          (3, 4), (2, 2), (2, 4), (4, 4), no_call]

        dists = dest_distances([fully_called_a, fully_called_b,
                                pop1_uncalled])
        assert numpy.allclose(dists, [0.65490196])

        dists = dest_distances([pop1_uncalled, pop1_uncalled, pop1_uncalled])
        assert numpy.isnan(dists[0])
コード例 #3
0
    def test_num_alleles(self):
        """Exercise calc_number_of_alleles with different population setups.

        All scenarios use five samples; the populations and the
        ``min_num_genotypes`` threshold vary per scenario.
        """

        def check(min_num_genotypes, genotypes, pops, expected):
            stat_funct = partial(calc_number_of_alleles,
                                 min_num_genotypes=min_num_genotypes)
            varis = VariationsArrays()
            varis[GT_FIELD] = numpy.array(genotypes)
            varis.samples = [1, 2, 3, 4, 5]
            self._check_function(stat_funct, varis, pops, expected)

        all_uncalled = [[-1], [-1], [-1], [-1], [-1]]

        check(0,
              [[[0], [0], [0], [0], [-1]],
               [[0], [0], [1], [1], [-1]],
               [[0], [0], [0], [1], [-1]],
               all_uncalled],
              {1: [1, 2], 2: [3, 4, 5]},
              {1: [1, 1, 1, 0], 2: [1, 1, 2, 0]})

        # a population empty
        check(0,
              [[[-1], [-1], [0], [0], [-1]],
               [[-1], [-1], [1], [1], [-1]],
               [[-1], [-1], [0], [1], [-1]],
               all_uncalled],
              {1: [1, 2], 2: [3, 4, 5]},
              {1: [0, 0, 0, 0], 2: [1, 1, 2, 0]})

        # The last two scenarios share the same genotype matrix.
        single_call_in_first_pair = [[[1], [-1], [0], [0], [-1]],
                                     [[-1], [-1], [1], [1], [-1]],
                                     [[-1], [-1], [0], [1], [-1]],
                                     all_uncalled]

        # only one pop
        check(0,
              single_call_in_first_pair,
              {1: [1, 2]},
              {1: [1, 0, 0, 0]})

        # min num genotypes
        check(3,
              single_call_in_first_pair,
              {1: [1, 2, 3, 4, 5]},
              {1: [2, 0, 0, 0]})
コード例 #4
0
    def test_nei_dist(self):
        """Check the pairwise unbiased Nei distance between two populations.

        Samples 1-2 and 3-4 form the two populations.  The distance is
        checked with and without chunking, for an all-missing matrix, and
        when ``min_num_genotypes`` is left at its default.
        """

        def make_varis(genotypes):
            fixture = VariationsArrays()
            fixture[GT_FIELD] = numpy.array(genotypes)
            fixture.samples = [1, 2, 3, 4]
            return fixture

        uncalled_variant = [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]
        genotypes = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                     [[1, 1], [1, 2], [2, 2], [2, 1]],
                     uncalled_variant]
        expected_dist = 0.3726315908494797

        varis = make_varis(genotypes)
        pops = [[1, 2], [3, 4]]
        whole = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isclose(whole[0], expected_dist)

        chunked = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                        populations=pops,
                                                        min_num_genotypes=1,
                                                        chunk_size=1)
        assert math.isclose(chunked[0], expected_dist)

        # all missing
        varis = make_varis([uncalled_variant])
        pops = [[1, 2], [3, 4]]
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isnan(dists[0])

        # min_num_genotypes
        varis = make_varis(genotypes)
        pops = [[1, 2], [3, 4]]
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isclose(dists[0], expected_dist)

        # Without an explicit min_num_genotypes the default applies and the
        # test expects no distance to be computable.
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      chunk_size=1)
        assert math.isnan(dists[0])
コード例 #5
0
    def test_dest_jost_distance(self):
        """Check the 'dest' population distance, chunked and unchunked."""
        no_call = (-1, -1)
        first_variant = [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2),
                         (3, 4), (2, 2), (2, 4), (4, 4), no_call]
        second_variant = [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2),
                          (3, 4), (2, 2), (2, 4), (4, 4), no_call]

        snps = VariationsArrays()
        snps['/calls/GT'] = numpy.array([first_variant, second_variant])
        snps.samples = list(range(1, 12))
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

        # Same expected distance with the default chunking and chunk_size=1.
        for chunk_kwargs in ({}, {'chunk_size': 1}):
            dists = calc_pop_distance(snps,
                                      populations=pops,
                                      method='dest',
                                      min_num_genotypes=0,
                                      **chunk_kwargs)
            assert numpy.allclose(dists, [0.65490196])

        # With min_num_genotypes=6 the test expects only NaN distances.
        dists = calc_pop_distance(snps,
                                  populations=pops,
                                  method='dest',
                                  min_num_genotypes=6,
                                  chunk_size=1)
        assert numpy.all(numpy.isnan(dists))
コード例 #6
0
    def test_excel(self):
        """Smoke-test write_excel with progressively richer variation data.

        The test only checks that writing does not raise.  Temporary files
        are opened through context managers so they are closed (and removed)
        even when a write fails.
        """
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                           [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        variations.samples = list(range(gts.shape[1]))

        # genotypes only
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

        # chrom pos
        variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
        variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

            # REF, ALT (written to the same handle, as before)
            variations[REF_FIELD] = numpy.array([b'A', b'A', b'A', b'A'])
            variations[ALT_FIELD] = numpy.array([[b'T'], [b'T'], [b'T'],
                                                 [b'T']])
            write_excel(variations, fhand)

            # with classifications
            classes = [1, 1, 1, 2, 2]
            write_excel(variations, fhand, classes)
コード例 #7
0
    def test_report(self):
        """Check that create_pop_stats_report writes the stats CSV file.

        The output directory is handled by a context manager so it is
        removed even when the report creation or the assertion fails.
        """
        gts = numpy.array([[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
                           [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4, 5]
        pops = {1: [1, 2], 2: [3, 4, 5]}
        violin_ylimits = {
            'observed_heterozigosity': {
                'bottom': 0,
                'top': 0.5
            }
        }

        with tempfile.TemporaryDirectory() as out_dir:
            create_pop_stats_report(varis,
                                    pops,
                                    out_dir,
                                    min_num_genotypes=1,
                                    min_call_dp_for_obs_het=0,
                                    violin_ylimits=violin_ylimits)
            stats_csv_fpath = os.path.join(out_dir, 'pop_stats.csv')
            assert os.path.exists(stats_csv_fpath)
            # NOTE(review): the previous version also built the path
            # 'pop_stats_violin_plots.svg' but never checked it; presumably
            # the report writes that file too -- confirm and assert it.
コード例 #8
0
    def test_calc_obs_het(self):
        """Check calc_obs_het per population with two depth thresholds."""
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array(
            [[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
             [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
             [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        # Only the second sample of the first variant has a depth below 20.
        varis[DP_FIELD] = numpy.array([[20, 15, 20, 20, 20],
                                       [20, 20, 20, 20, 20],
                                       [20, 20, 20, 20, 20]])
        varis.samples = [1, 2, 3, 4, 5]
        pops = {1: [1, 2], 2: [3, 4, 5]}

        scenarios = (
            # (min_call_dp, expected values per population)
            (0, {1: [0.5, 0, math.nan], 2: [0, 1., math.nan]}),
            # now setting a depth_threshold
            (20, {1: [0, 0, math.nan], 2: [0, 1., math.nan]}),
        )
        for min_call_dp, expected in scenarios:
            obs_het = partial(calc_obs_het,
                              min_num_genotypes=1,
                              min_call_dp=min_call_dp)
            self._check_function(obs_het, varis, pops, expected)
コード例 #9
0
    def test_calc_distrib_for_sample(self):
        """Check per-sample field distributions, with and without masks.

        The first half compares the depth distribution of one sample read
        from the HDF5 fixture against an in-memory copy, chunked and
        unchunked.  The second half checks per-sample distributions on a
        tiny hand-made matrix, optionally masked by a genotype predicate.
        """
        dp_path = '/calls/DP'
        gt_path = '/calls/GT'
        sample_name = '1_17_1_gbs'

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks())

        from_h5, _ = calc_field_distrib_for_a_sample(hdf5,
                                                     field=dp_path,
                                                     sample=sample_name,
                                                     n_bins=15)
        assert from_h5.shape == (15,)

        unchunked, _ = calc_field_distrib_for_a_sample(snps,
                                                       field=dp_path,
                                                       n_bins=15,
                                                       sample=sample_name,
                                                       chunk_size=None)
        assert numpy.all(from_h5 == unchunked)

        chunked, _ = calc_field_distrib_for_a_sample(snps,
                                                     field=dp_path,
                                                     n_bins=15,
                                                     sample=sample_name,
                                                     chunk_size=50)
        assert numpy.all(chunked == unchunked)

        def counts(*filled_bins):
            # One row of 16 bins holding a 1 in each of the given positions.
            row = [0] * 16
            for bin_idx in filled_bins:
                row[bin_idx] = 1
            return row

        vars_ = VariationsArrays()
        vars_[dp_path] = numpy.array([[10, 5, 15],
                                      [0, 15, 10]])
        vars_[gt_path] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                      [[0, 0], [0, 1], [1, 1]]])
        vars_.samples = list(range(3))

        scenarios = (
            # (extra keyword arguments, expected distribution per sample)
            ({},
             [counts(0, 10), counts(5, 15), counts(10, 15)]),
            ({'mask_field': gt_path, 'mask_func': call_is_het},
             [counts(), counts(5, 15), counts()]),
            ({'mask_field': gt_path, 'mask_func': call_is_hom},
             [counts(0, 10), counts(), counts(10, 15)]),
        )
        for mask_kwargs, expec in scenarios:
            distrib, _ = calc_field_distribs_per_sample(vars_,
                                                        field=dp_path,
                                                        n_bins=16,
                                                        **mask_kwargs)
            assert numpy.all(expec == distrib)
            assert numpy.all(calc_depth(vars_) == [10, 5, 15, 0, 15, 10])
コード例 #10
0
    def test_calc_distrib_for_sample(self):
        """Check per-sample field distributions, with and without masks.

        The first half compares the depth distribution of one sample read
        from the HDF5 fixture against an in-memory copy, chunked and
        unchunked.  The second half checks per-sample distributions on a
        tiny hand-made matrix, optionally masked by a genotype predicate.
        """
        dp_path = '/calls/DP'
        gt_path = '/calls/GT'
        sample_name = '1_17_1_gbs'

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks())

        from_h5, _ = calc_field_distrib_for_a_sample(hdf5,
                                                     field=dp_path,
                                                     sample=sample_name,
                                                     n_bins=15)
        assert from_h5.shape == (15,)

        unchunked, _ = calc_field_distrib_for_a_sample(snps,
                                                       field=dp_path,
                                                       n_bins=15,
                                                       sample=sample_name,
                                                       chunk_size=None)
        assert numpy.all(from_h5 == unchunked)

        chunked, _ = calc_field_distrib_for_a_sample(snps,
                                                     field=dp_path,
                                                     n_bins=15,
                                                     sample=sample_name,
                                                     chunk_size=50)
        assert numpy.all(chunked == unchunked)

        def counts(*filled_bins):
            # One row of 16 bins holding a 1 in each of the given positions.
            row = [0] * 16
            for bin_idx in filled_bins:
                row[bin_idx] = 1
            return row

        vars_ = VariationsArrays()
        vars_[dp_path] = numpy.array([[10, 5, 15],
                                      [0, 15, 10]])
        vars_[gt_path] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                      [[0, 0], [0, 1], [1, 1]]])
        vars_.samples = list(range(3))

        scenarios = (
            # (extra keyword arguments, expected distribution per sample)
            ({},
             [counts(0, 10), counts(5, 15), counts(10, 15)]),
            ({'mask_field': gt_path, 'mask_func': call_is_het},
             [counts(), counts(5, 15), counts()]),
            ({'mask_field': gt_path, 'mask_func': call_is_hom},
             [counts(0, 10), counts(), counts(10, 15)]),
        )
        for mask_kwargs, expec in scenarios:
            distrib, _ = calc_field_distribs_per_sample(vars_,
                                                        field=dp_path,
                                                        n_bins=16,
                                                        **mask_kwargs)
            assert numpy.all(expec == distrib)
            assert numpy.all(calc_depth(vars_) == [10, 5, 15, 0, 15, 10])
コード例 #11
0
    def test_gst_basic(self):
        """Check calc_gst_per_loci on four loci with one sample per population."""
        allele_depths = numpy.array([
            [[10, 3, -1], [11, 2, -1]],
            [[10, 0, -1], [10, 0, -1]],
            [[10, 10, -1], [11, 11, -1]],
            [[-1, 2, 10], [-1, 10, 2]],
        ])

        snps = VariationsArrays()
        snps.samples = [1, 2]
        snps[AD_FIELD] = allele_depths

        # Each sample is its own population.
        gst = calc_gst_per_loci(snps, [[1], [2]])
        numpy.testing.assert_almost_equal(
            gst, numpy.array([0.00952381, 0, 0, 0.44444444]))
コード例 #12
0
    def test_is_variable_func(self):
        """Check is_variable when restricted to samples 1 and 5."""
        genotypes = [
            [[-1, -1], [1, 1], [0, 1], [1, 1], [-1, -1]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
            [[0, 0], [0, 0], [0, 1], [0, 0], [0, 0]],
        ]
        variations = VariationsArrays()
        variations[GT_FIELD] = numpy.array(genotypes)
        variations.samples = [1, 2, 3, 4, 5]

        # One expected flag per variant, considering samples 1 and 5 only.
        variable = is_variable(variations, samples=[1, 5])
        assert numpy.all(
            variable == [MISSING_INT, TRUE_INT, TRUE_INT, FALSE_INT])
コード例 #13
0
    def test_calc_exp_het(self):
        """Check calc_exp_het per population with min_num_genotypes=1."""
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array(
            [[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
             [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
             [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis.samples = [1, 2, 3, 4, 5]

        exp_het = partial(calc_exp_het, min_num_genotypes=1)
        populations = {1: [1, 2], 2: [3, 4, 5]}
        # The all-missing third variant is expected to yield NaN.
        expected_by_pop = {1: [0.5, 0, math.nan], 2: [0, 0.6, math.nan]}
        self._check_function(exp_het, varis, populations, expected_by_pop)
コード例 #14
0
    def test_calc_major_allele_freq(self):
        """Check calc_major_allele_freq per population, min_num_genotypes=1."""
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array([
            [[0], [0], [0], [0], [-1]],
            [[0], [0], [1], [1], [-1]],
            [[0], [2], [0], [1], [2]],
            [[-1], [-1], [-1], [-1], [-1]],
        ])
        varis.samples = [1, 2, 3, 4, 5]

        major_freq = partial(calc_major_allele_freq, min_num_genotypes=1)
        populations = {1: [1, 2], 2: [3, 4, 5]}
        # The all-missing last variant is expected to yield NaN.
        expected_by_pop = {
            1: [1., 1., 0.5, math.nan],
            2: [1., 1., 1 / 3, math.nan],
        }
        self._check_function(major_freq, varis, populations, expected_by_pop)
コード例 #15
0
    def test_nei_dist(self):
        """Check the 'nei' population distance, unchunked and by chunk.

        The comparison uses the absolute difference.  The previous one-sided
        check (``value - expected < 0.001``) passed for any value smaller
        than the expectation, including large negative ones.
        """
        gts = [[[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 0]],
               [[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]],
               [[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]]]
        snps = VariationsArrays()
        snps['/calls/GT'] = numpy.array(gts)
        snps.samples = [1, 2, 3, 4, 5, 6, 7]

        # First pass relies on the default chunking, second pass goes by
        # chunk; both must give the same distances.
        for chunk_kwargs in ({}, {'chunk_size': 2}):
            cases = (
                # (populations, expected distance)
                ([[1, 2, 3], [4, 5, 6, 7]], 3.14019792),
                ([[1, 2, 3], [1, 2, 3]], 0),
                ([[1, 2, 3], [1, 4, 5, 6, 7]], 1.23732507),
            )
            for pops, expected in cases:
                dists = calc_pop_distance(snps,
                                          populations=pops,
                                          method='nei',
                                          min_num_genotypes=0,
                                          **chunk_kwargs)
                assert abs(dists[0] - expected) < 0.001
コード例 #16
0
    def test_pca(self):
        """Run do_pca on the HDF5 fixture and on a tiny three-sample matrix."""
        # Smoke test: only checks that the call does not raise.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        do_pca(hdf5)

        # Four identical variants: 'a' and 'b' share genotypes, 'c' differs.
        variant = [[0, 0], [0, 0], [1, 1]]
        gts = numpy.array([variant] * 4)
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = ['a', 'b', 'c']

        projs = do_pca(varis)['projections']
        n_samples = gts.shape[1]
        assert projs.shape[0] == n_samples
        proj_a, proj_b, proj_c = projs[0], projs[1], projs[2]
        assert numpy.allclose(proj_a, proj_b)
        assert not numpy.allclose(proj_a, proj_c)
コード例 #17
0
    def test_pca(self):
        """Run do_pca on the HDF5 fixture and on a tiny three-sample matrix."""
        # Smoke test: only checks that the call does not raise.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        do_pca(hdf5)

        # Four identical variants: 'a' and 'b' share genotypes, 'c' differs.
        variant = [[0, 0], [0, 0], [1, 1]]
        gts = numpy.array([variant] * 4)
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = ['a', 'b', 'c']

        projs = do_pca(varis)['projections']
        n_samples = gts.shape[1]
        assert projs.shape[0] == n_samples
        proj_a, proj_b, proj_c = projs[0], projs[1], projs[2]
        assert numpy.allclose(proj_a, proj_b)
        assert not numpy.allclose(proj_a, proj_c)
コード例 #18
0
    def test_samples(self):
        """Check that samples can be set on in-memory and HDF5 variations.

        The VCF fixture is opened with a context manager so the handle is
        closed even if parsing or loading raises.
        """
        gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                           [[0, 0], [0, 0], [1, 1], [2, 2]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4]
        assert varis.samples == [1, 2, 3, 4]

        # With another file
        # The temporary file is closed (and thereby removed) right away; it
        # is only used to obtain a fresh path for the HDF5 file.
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)
        # Renaming the first sample must be accepted by the setter.
        samples = h5.samples
        samples[0] = '0'
        h5.samples = samples
コード例 #19
0
    def test_samples(self):
        """Check that samples can be set on in-memory and HDF5 variations.

        The VCF fixture is opened with a context manager so the handle is
        closed even if parsing or loading raises.
        """
        gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                           [[0, 0], [0, 0], [1, 1], [2, 2]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4]
        assert varis.samples == [1, 2, 3, 4]

        # With another file
        # The temporary file is closed (and thereby removed) right away; it
        # is only used to obtain a fresh path for the HDF5 file.
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)
        # Renaming the first sample must be accepted by the setter.
        samples = h5.samples
        samples[0] = '0'
        h5.samples = samples
コード例 #20
0
ファイル: test_writers.py プロジェクト: JoseBlanca/variation
    def test_excel(self):
        """Smoke-test write_excel with progressively richer variation data.

        The test only checks that writing does not raise.  Temporary files
        are opened through context managers so they are closed (and removed)
        even when a write fails.
        """
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                           [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        variations.samples = list(range(gts.shape[1]))

        # genotypes only
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

        # chrom pos
        variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
        variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

            # REF, ALT (written to the same handle, as before)
            variations[REF_FIELD] = numpy.array(['A', 'A', 'A', 'A'])
            variations[ALT_FIELD] = numpy.array([['T'], ['T'], ['T'], ['T']])
            write_excel(variations, fhand)
コード例 #21
0
def _filter_samples_by_index(variations,
                             sample_cols,
                             filtered_vars=None,
                             reverse=False):
    """Copy ``variations`` keeping only the selected sample columns.

    Args:
        variations: Source container; it must expose ``samples``,
            ``metadata``, ``keys()`` and item access by field path.
        sample_cols: Either a boolean mask with one entry per sample or a
            collection of integer column indexes to keep.
        filtered_vars: Container that receives the result.  A new
            ``VariationsArrays`` is created when omitted.
        reverse: When true the selection is inverted, so the selected
            samples are dropped instead of kept.

    Returns:
        ``filtered_vars`` holding every field of ``variations``; fields whose
        path contains ``'calls'`` are sliced along axis 1 with the mask.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
    except AttributeError:
        # Plain Python sequence: decide from its first item.  numpy.bool_
        # is not a subclass of bool, so it has to be accepted explicitly.
        item = first(iter(sample_cols))
        is_bool = isinstance(item, (bool, numpy.bool_))
    else:
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    if not is_bool:
        # Turn the collection of column indexes into a boolean mask.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if not hasattr(sample_cols, 'shape'):
        # The ``numpy.bool`` alias is missing in NumPy 1.24-1.26; the
        # builtin ``bool`` works as a dtype in every NumPy version.
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            # Read the dataset completely before indexing it.
            matrix = matrix[:]
        if 'calls' in path:
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    kept_samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    filtered_vars.samples = kept_samples
    return filtered_vars
コード例 #22
0
    def test_kosman_pairwise_between_pops_by_chunk(self):
        """Check pairwise distances between the samples of two populations.

        Three population splits of the same four samples are compared
        against sub-blocks of the same 4x4 distance matrix.  The last split
        is now asserted as well; previously its result was computed but
        never checked.
        """
        a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                         [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        # Stacking yields shape (4, 11, 2); the transpose turns it into
        # (11, 4, 2) before it is stored as the genotype matrix.
        gts = numpy.stack((a, b, c, d), axis=0)
        gts = numpy.transpose(gts, axes=(1, 0, 2)).astype(numpy.int16)
        variations = VariationsArrays()
        variations.samples = [1, 2, 3, 4]
        variations['/calls/GT'] = gts

        # every sample against every sample
        expected = [[0., 0.33333333, 0.75, 0.75], [0.33333333, 0., 0.45, 0.45],
                    [0.75, 0.45, 0., 0.], [0.75, 0.45, 0., 0.]]
        distance = calc_pairwise_distances_between_pops(
            variations,
            chunk_size=None,
            min_num_snps=1,
            pop1_samples=[1, 2, 3, 4],
            pop2_samples=[1, 2, 3, 4])
        assert numpy.allclose(distance, expected)

        # first sample against every sample: first row of the full matrix
        expected = [[0., 0.33333333, 0.75, 0.75]]
        distance = calc_pairwise_distances_between_pops(
            variations,
            chunk_size=None,
            min_num_snps=1,
            pop1_samples=[1],
            pop2_samples=[1, 2, 3, 4])
        assert numpy.allclose(distance, expected)

        # samples 1-2 against samples 3-4: upper-right block of the matrix
        expected = [[0.75, 0.75], [0.45, 0.45]]
        distance = calc_pairwise_distances_between_pops(variations,
                                                        chunk_size=None,
                                                        min_num_snps=1,
                                                        pop1_samples=[1, 2],
                                                        pop2_samples=[3, 4])
        assert numpy.allclose(distance, expected)
コード例 #23
0
    def test_calc_called_gts_distribution_per_depth(self):
        """Check the called-genotype distribution for several depth cut-offs."""
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunked, _ = calc_called_gts_distrib_per_depth(h5_vars,
                                                       depths=range(30),
                                                       chunk_size=10)
        assert chunked[1, 1] == 1
        # Chunked and unchunked processing must agree.
        whole, _ = calc_called_gts_distrib_per_depth(h5_vars,
                                                     depths=range(30),
                                                     chunk_size=None)
        assert numpy.all(chunked == whole)

        # A single variant with ten samples.
        call_gts = [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
                    [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]
        call_depths = [10, 5, 15, 7, 10, 0, 0, 25, 20, 10]
        vars_ = VariationsArrays()
        vars_['/calls/GT'] = numpy.array([call_gts])
        vars_['/calls/DP'] = numpy.array([call_depths])
        vars_.samples = list(range(10))

        dist, _ = calc_called_gts_distrib_per_depth(vars_,
                                                    depths=[0, 5, 10, 30])
        # One row per requested depth; each row holds a single 1, placed at
        # columns 9, 7, 5 and 0 respectively.
        expected = [[int(col == hot_col) for col in range(10)]
                    for hot_col in (9, 7, 5, 0)]
        assert numpy.all(dist == expected)
コード例 #24
0
    def test_calc_maf_depth_distribs_per_sample(self):
        """Check calc_maf_depth_distribs_per_sample on three inputs."""
        ao_path = '/calls/AO'
        ro_path = '/calls/RO'

        # Empty matrices: both returned values are expected to be None.
        empty = VariationsArrays()
        empty[ao_path] = numpy.array([])
        empty[ro_path] = numpy.array([])
        distribs, bins = calc_maf_depth_distribs_per_sample(empty,
                                                            chunk_size=None)
        assert distribs is None
        assert bins is None

        # One variant, three samples.
        tiny = VariationsArrays()
        tiny[ao_path] = numpy.array([[[0, 0], [0, 0], [15, -1]]])
        tiny[ro_path] = numpy.array([[10, 5, 15]])
        tiny.samples = list(range(3))
        distribs, _ = calc_maf_depth_distribs_per_sample(tiny,
                                                         n_bins=4,
                                                         min_depth=6,
                                                         chunk_size=None)
        assert numpy.all(distribs == [[0, 0, 0, 1],
                                      [0, 0, 0, 0],
                                      [0, 0, 1, 0]])

        # HDF5 fixture: explicit chunk_size=None must match the default.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        unchunked, _ = calc_maf_depth_distribs_per_sample(hdf5,
                                                          min_depth=6,
                                                          chunk_size=None)
        default_chunks, _ = calc_maf_depth_distribs_per_sample(hdf5,
                                                               min_depth=6)
        assert numpy.all(unchunked == default_chunks)
コード例 #25
0
ファイル: filters.py プロジェクト: JoseBlanca/variation
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Copy ``variations`` keeping only the selected sample columns.

    Args:
        variations: source container. It must provide ``samples``,
            ``metadata``, ``keys()`` and item access by path.
        sample_cols: either a boolean mask with one entry per sample or a
            collection of integer sample indexes. Indexes outside
            ``range(len(samples))`` are ignored.
        filtered_vars: container that receives the result. A new
            ``VariationsArrays`` is created when it is None.
        reverse: when true the selection is inverted, so the selected
            samples are the ones that get dropped.

    Returns:
        ``filtered_vars``. Every matrix whose path contains ``'calls'`` is
        sliced along its second axis with the mask, any other matrix is
        stored as it is, and ``metadata`` and ``samples`` are set.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples

    # numpy.bool_ is used on purpose: the numpy.bool alias was removed in
    # NumPy 1.24 and accessing it raises AttributeError, which the previous
    # try/except around the dtype lookup silently swallowed, sending real
    # boolean arrays down the non-array path.
    dtype = getattr(sample_cols, 'dtype', None)
    if dtype is not None:
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    else:
        item = first(iter(sample_cols))
        # numpy.bool_ is not a subclass of bool, so a plain list built from
        # a numpy mask has to be recognised explicitly.
        is_bool = isinstance(item, (bool, numpy.bool_))

    if not is_bool:
        # Turn the collection of indexes into a per-sample boolean mask.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    mask = numpy.asarray(sample_cols, dtype=numpy.bool_)
    if reverse:
        mask = numpy.logical_not(mask)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            # Load the dataset before indexing it with the mask.
            matrix = matrix[:]
        filtered_vars[path] = matrix[:, mask] if 'calls' in path else matrix

    filtered_vars.metadata = variations.metadata
    filtered_vars.samples = [samples[idx] for idx, keep in enumerate(mask)
                             if keep]
    return filtered_vars
コード例 #26
0
    def test_calc_called_gts_distribution_per_depth(self):
        """Check calc_called_gts_distrib_per_depth on HDF5 and array data.

        The ril.hdf5 file is processed with chunk_size=10 and with
        chunk_size=None and both results must match; a single-row
        in-memory dataset is then compared with a hand-written matrix.
        """
        hdf5_path = join(TEST_DATA_DIR, 'ril.hdf5')
        hdf5 = VariationsH5(hdf5_path, mode='r')
        depths = range(30)
        chunked, _ = calc_called_gts_distrib_per_depth(hdf5, depths=depths,
                                                       chunk_size=10)
        assert chunked[1, 1] == 1
        whole, _ = calc_called_gts_distrib_per_depth(hdf5, depths=depths,
                                                     chunk_size=None)
        assert numpy.all(chunked == whole)

        # One row of ten samples: genotype calls and their depths.
        genotypes = [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
                     [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]
        sample_depths = [10, 5, 15, 7, 10, 0, 0, 25, 20, 10]
        vars_ = VariationsArrays()
        vars_['/calls/GT'] = numpy.array([genotypes])
        vars_['/calls/DP'] = numpy.array([sample_depths])
        vars_.samples = list(range(10))

        dist, _ = calc_called_gts_distrib_per_depth(vars_,
                                                    depths=[0, 5, 10, 30])
        # One expected row per requested depth.
        expected = [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
        assert numpy.all(dist == expected)
コード例 #27
0
    def test_calc_maf_depth_distribs_per_sample(self):
        """Exercise calc_maf_depth_distribs_per_sample on three inputs.

        The inputs are: empty AO/RO matrices, a single-row in-memory
        dataset with three samples, and the ril.hdf5 test file read once
        with chunk_size=None and once with the default chunk_size.
        """
        # Empty allele-count matrices: both returned values must be None.
        no_data = VariationsArrays()
        for field in ('/calls/AO', '/calls/RO'):
            no_data[field] = numpy.array([])
        distribs, bins = calc_maf_depth_distribs_per_sample(no_data,
                                                            chunk_size=None)
        assert distribs is None
        assert bins is None

        # Single-row dataset with three samples.
        three_samples = VariationsArrays()
        three_samples['/calls/AO'] = numpy.array([[[0, 0],
                                                   [0, 0],
                                                   [15, -1]]])
        three_samples['/calls/RO'] = numpy.array([[10, 5, 15]])
        three_samples.samples = list(range(3))
        call_kwargs = {'n_bins': 4, 'min_depth': 6, 'chunk_size': None}
        distribs, _ = calc_maf_depth_distribs_per_sample(three_samples,
                                                         **call_kwargs)
        expected = [[0, 0, 0, 1],
                    [0, 0, 0, 0],
                    [0, 0, 1, 0]]
        assert numpy.all(distribs == expected)

        # chunk_size=None and the default chunk_size must give equal results.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        without_chunks, _ = calc_maf_depth_distribs_per_sample(
            hdf5, min_depth=6, chunk_size=None)
        with_default, _ = calc_maf_depth_distribs_per_sample(hdf5,
                                                             min_depth=6)
        assert numpy.all(without_chunks == with_default)
コード例 #28
0
    def test_diploid_writing(self):
        """Check the FASTA written with one sequence per haplotype.

        write_fasta is called with
        write_one_seq_per_sample_setting_hets_to_missing=False and the
        output must contain a '<sample>_hap1' and a '<sample>_hap2' record
        for each of the five samples.
        """
        gts = numpy.array([
            [[0, 0], [2, 2], [1, 1], [0, 0], [0, 0]],
            [[2, 2], [1, 1], [-1, 2], [0, 0], [-1, -1]],
            [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
            [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
            [[0, 1], [0, 1], [-1, -1], [-1, 1], [1, 0]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
            [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
        ])
        ref = numpy.array([b'C', b'G', b'A', b'T', b'T', b'C', b'G'])
        alt = numpy.array([
            [b'CT', b'CTT'],
            [b'GA', b'GAT'],
            [b'C', b''],
            [b'G', b''],
            [b'A', b''],
            [b'G', b''],
            [b'C', b''],
        ])
        chroms = ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3']
        positions = [10, 20, 30, 40, 50, 10, 15]

        variations = VariationsArrays()
        variations[GT_FIELD] = gts
        variations[ALT_FIELD] = alt
        variations[REF_FIELD] = ref
        variations[CHROM_FIELD] = numpy.array(chroms)
        variations[POS_FIELD] = numpy.array(positions)
        variations.samples = [str(idx) for idx in range(gts.shape[1])]

        fhand = io.BytesIO()
        write_fasta(variations,
                    fhand,
                    remove_invariant_snps=True,
                    remove_indels=False,
                    try_to_align_easy_indels=True,
                    write_one_seq_per_sample_setting_hets_to_missing=False)

        # Alleles the expected sequences are built from (reference first,
        # indel alleles padded with hyphens):
        #   C-- C-T CTT
        #   G-- GA- GAT
        #   A C
        #   T G
        #   T A
        # Genotypes of the five variations used (rows) for each sample
        # (columns); N marks a missing allele:
        #   00 22 11 00 00
        #   22 11 N2 00 NN
        #   01 00 00 11 00
        #   00 1N 11 1N 11
        #   01 01 NN N1 10
        expected_records = [
            (b'>0_hap1', b'C--GATATT'),
            (b'>0_hap2', b'C--GATCTA'),
            (b'>1_hap1', b'CTTGA-AGT'),
            (b'>1_hap2', b'CTTGA-ANA'),
            (b'>2_hap1', b'C-TNNNAGN'),
            (b'>2_hap2', b'C-TGATAGN'),
            (b'>3_hap1', b'C--G--CGN'),
            (b'>3_hap2', b'C--G--CNA'),
            (b'>4_hap1', b'C--NNNAGA'),
            (b'>4_hap2', b'C--NNNAGT'),
        ]
        lines = fhand.getvalue().splitlines()
        # Records are written as a header line followed by a sequence line.
        for idx, (header, seq) in enumerate(expected_records):
            assert header in lines[2 * idx]
            assert lines[2 * idx + 1] == seq
コード例 #29
0
    def test_fasta_writer(self):
        """Check the FASTA written with one sequence per sample.

        write_fasta is run twice with remove_invariant_snps=True and once
        with remove_sites_all_N=True, always with
        write_one_seq_per_sample_setting_hets_to_missing=True.
        """
        gts = numpy.array([
            [[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
            [[2, 2], [1, 1], [-1, 2], [0, 0], [-1, -1]],
            [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
            [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
            [[0, 1], [0, 1], [-1, -1], [-1, 1], [1, 0]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
            [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
        ])
        ref = numpy.array(['C', 'G', 'A', 'T', 'T', 'C', 'C'])
        alt = numpy.array([
            ['A', 'TT'],
            ['A', 'T'],
            ['C', ''],
            ['G', ''],
            ['A', 'T'],
            ['G', ''],
            ['G', ''],
        ])
        chroms = ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3']
        positions = [10, 20, 30, 40, 50, 10, 15]

        variations = VariationsArrays()
        variations[GT_FIELD] = gts
        variations[ALT_FIELD] = alt
        variations[REF_FIELD] = ref
        variations[CHROM_FIELD] = numpy.array(chroms)
        variations[POS_FIELD] = numpy.array(positions)
        variations.samples = [str(idx) for idx in range(gts.shape[1])]

        one_seq = {'write_one_seq_per_sample_setting_hets_to_missing': True}

        # Alleles per variation (reference first):
        #   C A TT
        #   G A T
        #   A C
        #   T G
        #   T A T
        #   C G
        #   N
        fhand = io.BytesIO()
        write_fasta(variations, fhand, remove_invariant_snps=True, **one_seq)
        expected_records = [
            (b'>0', b'TNT'),
            (b'>1', b'AAN'),
            (b'>2', b'NAG'),
            (b'>3', b'GCN'),
            (b'>4', b'NAG'),
        ]
        lines = fhand.getvalue().splitlines()
        # Records are written as a header line followed by a sequence line.
        for idx, (header, seq) in enumerate(expected_records):
            assert header in lines[2 * idx]
            assert lines[2 * idx + 1] == seq

        # Same call again on a fresh buffer; only the first record is
        # checked.
        fhand = io.BytesIO()
        write_fasta(variations, fhand, remove_invariant_snps=True, **one_seq)
        lines = fhand.getvalue().splitlines()
        assert b'>0' in lines[0]
        assert lines[1] == b'TNT'

        # remove_sites_all_N instead of remove_invariant_snps; the output
        # is decoded so it is compared as text here.
        fhand = io.BytesIO()
        write_fasta(variations, fhand, remove_sites_all_N=True, **one_seq)
        text_lines = fhand.getvalue().decode().splitlines()
        assert '>0' in text_lines[0]
        assert text_lines[1] == 'TNTC'
コード例 #30
0
    def test_fasta_writer_with_indels(self):
        """Check the one-sequence-per-sample FASTA when indels are kept.

        write_fasta is called with remove_indels=False, first with
        try_to_align_easy_indels=True and then with
        put_hyphens_in_indels=False, and every record is compared with
        the expected header and sequence.
        """
        gts = numpy.array([
            [[0, 0], [2, 2], [1, 1], [0, 0], [0, 0]],
            [[2, 2], [1, 1], [-1, 2], [0, 0], [-1, -1]],
            [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
            [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
            [[0, 1], [0, 1], [-1, -1], [-1, 1], [1, 0]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
            [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
        ])
        ref = numpy.array([b'C', b'G', b'A', b'T', b'T', b'C', b'G'])
        alt = numpy.array([
            [b'CT', b'CTT'],
            [b'GA', b'GAT'],
            [b'C', b''],
            [b'G', b''],
            [b'A', b'T'],
            [b'G', b''],
            [b'C', b''],
        ])
        chroms = ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3']
        positions = [10, 20, 30, 40, 50, 10, 15]

        variations = VariationsArrays()
        variations[GT_FIELD] = gts
        variations[ALT_FIELD] = alt
        variations[REF_FIELD] = ref
        variations[CHROM_FIELD] = numpy.array(chroms)
        variations[POS_FIELD] = numpy.array(positions)
        variations.samples = [str(idx) for idx in range(gts.shape[1])]

        def check_records(lines, expected_records):
            # Records are a header line followed by a sequence line.
            for idx, (header, seq) in enumerate(expected_records):
                assert header in lines[2 * idx]
                assert lines[2 * idx + 1] == seq

        # Alleles the expected sequences are built from (reference first,
        # indel alleles padded with hyphens):
        #   C-- C-T CTT
        #   G-- GA- GAT
        #   A C
        #   T G
        # Per-sample calls for those variations (H: heterozygous,
        # N: missing):
        #   0 2 1 0 0
        #   2 1 H 0 N
        #   H 0 0 1 0
        #   0 H 1 H 1
        fhand = io.BytesIO()
        write_fasta(variations,
                    fhand,
                    remove_invariant_snps=True,
                    remove_indels=False,
                    try_to_align_easy_indels=True,
                    write_one_seq_per_sample_setting_hets_to_missing=True)
        check_records(fhand.getvalue().splitlines(), [
            (b'>0', b'C--GATNT'),
            (b'>1', b'CTTGA-AN'),
            (b'>2', b'C-TNNNAG'),
            (b'>3', b'C--G--CN'),
            (b'>4', b'C--NNNAG'),
        ])

        # Same data written without hyphens in the indel alleles.
        fhand = io.BytesIO()
        write_fasta(variations,
                    fhand,
                    remove_invariant_snps=True,
                    remove_indels=False,
                    put_hyphens_in_indels=False,
                    write_one_seq_per_sample_setting_hets_to_missing=True)
        check_records(fhand.getvalue().splitlines(), [
            (b'>0', b'CGATNT'),
            (b'>1', b'CTTGAAN'),
            (b'>2', b'CTNNNAG'),
            (b'>3', b'CGCN'),
            (b'>4', b'CNNNAG'),
        ])