Example #1
0
    def test_calc_gt_type_stats(self):
        """Check calc_gt_type_stats on the RIL fixture and on a toy array.

        The toy expectation lists, per sample (column), the counts of
        reference-homozygous, heterozygous, alternative-homozygous and
        missing genotypes (rows, in that order).
        """
        # Fixture file: only the result shape and the column totals are pinned.
        ril_path = join(TEST_DATA_DIR, 'ril.hdf5')
        ril_counts = calc_gt_type_stats(VariationsH5(ril_path, mode='r'))
        assert ril_counts.shape == (4, 153)
        column_totals = numpy.sum(ril_counts, axis=0)
        assert numpy.all(column_totals == 943)

        # Hand-written genotypes: 3 variants x 4 samples x 2 alleles,
        # with -1 standing for a missing allele.
        toy_gts = numpy.array([
            [[0, 0], [1, 1], [0, -1], [-1, -1]],
            [[0, -1], [0, 0], [0, -1], [-1, -1]],
            [[0, 1], [0, 0], [0, 0], [-1, -1]],
        ])
        toy_counts = calc_gt_type_stats({'/calls/GT': toy_gts})
        expected_counts = [
            [1, 2, 1, 0],
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [1, 0, 2, 3],
        ]
        assert numpy.all(toy_counts == expected_counts)
    def test_calc_gt_type_stats(self):
        """Exercise calc_gt_type_stats with an HDF5 fixture and a small dict.

        First the 'ril.hdf5' fixture is checked for result shape and for the
        per-column totals; then a hand-built genotype array wrapped in a
        plain dict is compared against a fully spelled-out count table.
        """
        # --- fixture-backed check ---
        fixture = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        fixture_stats = calc_gt_type_stats(fixture)
        expected_shape = (4, 153)
        assert fixture_stats.shape == expected_shape
        # Every column must add up to the same total.
        assert numpy.all(numpy.sum(fixture_stats, axis=0) == 943)

        # --- in-memory check ---
        variant_a = [[0, 0], [1, 1], [0, -1], [-1, -1]]
        variant_b = [[0, -1], [0, 0], [0, -1], [-1, -1]]
        variant_c = [[0, 1], [0, 0], [0, 0], [-1, -1]]
        in_memory = {'/calls/GT': numpy.array([variant_a, variant_b,
                                               variant_c])}
        in_memory_stats = calc_gt_type_stats(in_memory)
        # A genotype with any -1 allele lands in the last row of the table.
        wanted = [[1, 2, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 2, 3]]
        assert numpy.all(in_memory_stats == wanted)
def plot_gt_stats_per_sample(variations, data_dir, chunk_size=SNPS_PER_CHUNK):
    """Draw the per-sample genotype bar plots, targeting files in data_dir.

    Four calls to ``plot_barplot`` are made, each given a PNG path inside
    ``data_dir``:

    * ``genotype_counts_per_sample.png`` - all four genotype classes, stacked
    * ``missing_per_sample.png`` - last column of the stats (missing)
    * ``het_per_sample.png`` - column 1 of the stats (heterozygous)
    * ``gt_perc_per_sample.png`` - the non-missing classes as percentages

    Args:
        variations: object indexable by ``GT_FIELD`` and exposing a
            ``samples`` attribute; it is also handed to
            ``calc_gt_type_stats``.
        data_dir: directory the output file names are joined onto.
        chunk_size: forwarded unchanged to ``calc_gt_type_stats``.
    """
    def _one_arg(value):
        # Call description in the {'args': ..., 'kwargs': ...} layout used
        # for every entry of the mpl_params mapping.
        return {'args': [value], 'kwargs': {}}

    # After the transpose each row is presumably one sample and the columns
    # follow the class order of the labels below.
    per_sample = calc_gt_type_stats(variations,
                                    chunk_size=chunk_size).transpose()
    # Figure width is taken from the second dimension of the genotype array.
    plot_size = (variations[GT_FIELD].shape[1], 7)

    # One mapping is reused for all plots; only y label and title change.
    axes_calls = {
        'set_xlabel': _one_arg('Samples'),
        'set_ylabel': _one_arg('Number of GTs'),
        'set_title': _one_arg('Genotypes counts per sample'),
    }
    sample_names = variations.samples
    if sample_names is not None:
        axes_calls['set_xticklabels'] = _one_arg(sample_names)

    # 1) Every genotype class, stacked.
    class_labels = ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous',
                    'Missing GT']
    class_colors = ['darkslategrey', 'c', 'paleturquoise', 'cadetblue']
    plot_barplot(per_sample, class_labels, mpl_params=axes_calls,
                 color=class_colors,
                 fpath=join(data_dir, 'genotype_counts_per_sample.png'),
                 stacked=True, figsize=plot_size)

    # 2) Missing genotypes only.
    axes_calls['set_ylabel'] = _one_arg('Missing Genotypes Number')
    axes_calls['set_title'] = _one_arg('Missing genotypes counts per sample')
    plot_barplot(per_sample[:, -1], ['Missing GT'], mpl_params=axes_calls,
                 fpath=join(data_dir, 'missing_per_sample.png'),
                 stacked=True, figsize=plot_size)

    # 3) Heterozygous genotypes only.
    axes_calls['set_ylabel'] = _one_arg('Heterozygous Number')
    axes_calls['set_title'] = _one_arg('Heterozygous counts per sample')
    plot_barplot(per_sample[:, 1], ['Heterozygous'], mpl_params=axes_calls,
                 fpath=join(data_dir, 'het_per_sample.png'),
                 stacked=True, figsize=plot_size)

    # 4) Share of each called class, leaving the missing column out of both
    #    the numerator and the row totals.
    axes_calls['set_ylabel'] = _one_arg('% Genotypes')
    axes_calls['set_title'] = _one_arg('Genotypes percentage per sample')
    called = per_sample[:, :-1]
    called_perc = called / called.sum(axis=1, keepdims=True)
    called_perc *= 100
    plot_barplot(called_perc, class_labels[:-1], mpl_params=axes_calls,
                 fpath=join(data_dir, 'gt_perc_per_sample.png'),
                 figsize=plot_size)
Example #4
0
def plot_gt_stats_per_sample(variations, data_dir, chunk_size=SNPS_PER_CHUNK):
    """Produce the genotype-class bar plots, one bar (group) per sample.

    The genotype type statistics come from ``calc_gt_type_stats`` and are
    transposed before plotting. ``plot_barplot`` is then called four times
    with file paths built from ``data_dir``: all classes stacked, the
    missing column alone, the heterozygous column alone, and finally the
    non-missing classes rescaled to percentages of the called genotypes.

    Args:
        variations: source of the genotypes; must support
            ``variations[GT_FIELD]`` and ``variations.samples``.
        data_dir: directory used as the base of every output path.
        chunk_size: passed straight through to ``calc_gt_type_stats``.
    """
    counts = calc_gt_type_stats(variations, chunk_size=chunk_size)
    counts = counts.transpose()
    size = (variations[GT_FIELD].shape[1], 7)

    all_labels = ['Ref Homozygous', 'Heterozygous', 'Alt Homozygous',
                  'Missing GT']

    # Shared axes settings; the same dict object is passed to every plot and
    # only its y label and title entries are overwritten between calls.
    axis_calls = {}
    axis_calls['set_xlabel'] = {'args': ['Samples'], 'kwargs': {}}
    axis_calls['set_ylabel'] = {'args': ['Number of GTs'], 'kwargs': {}}
    axis_calls['set_title'] = {'args': ['Genotypes counts per sample'],
                               'kwargs': {}}
    names = variations.samples
    if names is not None:
        axis_calls['set_xticklabels'] = {'args': [names], 'kwargs': {}}

    # All genotype classes, stacked.
    plot_barplot(counts, all_labels,
                 mpl_params=axis_calls,
                 color=['darkslategrey', 'c', 'paleturquoise', 'cadetblue'],
                 fpath=join(data_dir, 'genotype_counts_per_sample.png'),
                 stacked=True,
                 figsize=size)

    # Single-class plots: (file name, title, y label, stats column, legend).
    single_class_plots = (
        ('missing_per_sample.png', 'Missing genotypes counts per sample',
         'Missing Genotypes Number', -1, 'Missing GT'),
        ('het_per_sample.png', 'Heterozygous counts per sample',
         'Heterozygous Number', 1, 'Heterozygous'),
    )
    for fname, plot_title, ylabel, column, legend in single_class_plots:
        axis_calls['set_ylabel'] = {'args': [ylabel], 'kwargs': {}}
        axis_calls['set_title'] = {'args': [plot_title], 'kwargs': {}}
        plot_barplot(counts[:, column], [legend],
                     mpl_params=axis_calls,
                     fpath=join(data_dir, fname),
                     stacked=True,
                     figsize=size)

    # Percentages over the non-missing classes only: the last column is
    # dropped before computing the per-row totals.
    axis_calls['set_ylabel'] = {'args': ['% Genotypes'], 'kwargs': {}}
    axis_calls['set_title'] = {'args': ['Genotypes percentage per sample'],
                               'kwargs': {}}
    non_missing = counts[:, :-1]
    row_totals = non_missing.sum(axis=1, keepdims=True)
    percentages = non_missing / row_totals
    percentages *= 100
    plot_barplot(percentages, all_labels[:3],
                 mpl_params=axis_calls,
                 fpath=join(data_dir, 'gt_perc_per_sample.png'),
                 figsize=size)