Esempio n. 1
0
    def test_concordance(self):
        """Check ``hl.concordance`` of a dataset compared against itself.

        Because both sides are the same dataset, every entry must land on
        the diagonal of the 5x5 global concordance matrix, and the diagonal
        cells must match independently aggregated genotype counts.
        """
        dataset = get_dataset()
        glob_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset)

        # Computed once and reused by the assertions below.
        n_rows = dataset.count_rows()
        n_cols = dataset.count_cols()

        # Every (row, column) entry is counted exactly once in the matrix.
        self.assertEqual(sum(sum(glob_conc[i]) for i in range(5)), n_rows * n_cols)

        counts = dataset.aggregate_entries(hl.Struct(
            n_het=agg.filter(dataset.GT.is_het(), agg.count()),
            n_hom_ref=agg.filter(dataset.GT.is_hom_ref(), agg.count()),
            n_hom_var=agg.filter(dataset.GT.is_hom_var(), agg.count()),
            nNoCall=agg.filter(hl.is_missing(dataset.GT), agg.count())))

        # Diagonal: index 1 = no call, 2 = hom-ref, 3 = het, 4 = hom-var.
        self.assertEqual(glob_conc[0][0], 0)
        self.assertEqual(glob_conc[1][1], counts.nNoCall)
        self.assertEqual(glob_conc[2][2], counts.n_hom_ref)
        self.assertEqual(glob_conc[3][3], counts.n_het)
        self.assertEqual(glob_conc[4][4], counts.n_hom_var)

        # Off-diagonal cells would mean the dataset disagrees with itself.
        for i in range(5):
            for j in range(5):
                if i != j:
                    self.assertEqual(glob_conc[i][j], 0)

        # Per-column matrices sum to the row count, and vice versa.
        self.assertTrue(cols_conc.all(hl.sum(hl.flatten(cols_conc.concordance)) == n_rows))
        self.assertTrue(rows_conc.all(hl.sum(hl.flatten(rows_conc.concordance)) == n_cols))

        # Only checks that both tables can be written; the second write
        # replaces the first because they share a path and overwrite=True.
        cols_conc.write('/tmp/foo.kt', overwrite=True)
        rows_conc.write('/tmp/foo.kt', overwrite=True)
Esempio n. 2
0
    def test_concordance(self):
        """Check ``hl.concordance`` of a dataset compared against itself.

        Because both sides are the same dataset, every entry must land on
        the diagonal of the 5x5 global concordance matrix, and the diagonal
        cells must match independently aggregated genotype counts.
        """
        dataset = get_dataset()
        glob_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset)

        # Computed once and reused by the assertions below.
        n_rows = dataset.count_rows()
        n_cols = dataset.count_cols()

        # Every (row, column) entry is counted exactly once in the matrix.
        self.assertEqual(sum(sum(glob_conc[i]) for i in range(5)), n_rows * n_cols)

        counts = dataset.aggregate_entries(hl.Struct(
            n_het=agg.filter(dataset.GT.is_het(), agg.count()),
            n_hom_ref=agg.filter(dataset.GT.is_hom_ref(), agg.count()),
            n_hom_var=agg.filter(dataset.GT.is_hom_var(), agg.count()),
            nNoCall=agg.filter(hl.is_missing(dataset.GT), agg.count())))

        # Diagonal: index 1 = no call, 2 = hom-ref, 3 = het, 4 = hom-var.
        self.assertEqual(glob_conc[0][0], 0)
        self.assertEqual(glob_conc[1][1], counts.nNoCall)
        self.assertEqual(glob_conc[2][2], counts.n_hom_ref)
        self.assertEqual(glob_conc[3][3], counts.n_het)
        self.assertEqual(glob_conc[4][4], counts.n_hom_var)

        # Off-diagonal cells would mean the dataset disagrees with itself.
        for i in range(5):
            for j in range(5):
                if i != j:
                    self.assertEqual(glob_conc[i][j], 0)

        # Per-column matrices sum to the row count, and vice versa.
        self.assertTrue(cols_conc.all(hl.sum(hl.flatten(cols_conc.concordance)) == n_rows))
        self.assertTrue(rows_conc.all(hl.sum(hl.flatten(rows_conc.concordance)) == n_cols))

        # Only checks that both tables can be written; the second write
        # replaces the first because they share a path and overwrite=True.
        cols_conc.write('/tmp/foo.kt', overwrite=True)
        rows_conc.write('/tmp/foo.kt', overwrite=True)
Esempio n. 3
0
def compute_concordance(mt: hl.MatrixTable, other_mt: hl.MatrixTable,
                        name: str) -> Tuple[hl.Table, hl.Table]:
    """Compute genotype concordance between two matrix tables.

    Each input is first reduced to the rows where at least one entry has a
    non-reference genotype. The global concordance summary is logged and
    the two per-axis tables are returned.

    :param mt: left-hand matrix table; must have a ``GT`` entry field.
    :param other_mt: right-hand matrix table; must have a ``GT`` entry field.
    :param name: label placed at the start of the logged summary line.
    :return: ``(sample_concordance_ht, sites_concordance_ht)``, the second
        and third results of ``hl.concordance``.
    """
    # Keep only rows where some sample carries a non-reference call.
    mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))
    other_mt = other_mt.filter_rows(hl.agg.any(other_mt.GT.is_non_ref()))

    summary, sample_concordance_ht, sites_concordance_ht = hl.concordance(
        mt, other_mt)
    # Pass `name` as a logging argument instead of interpolating it into the
    # format string: a '%' inside `name` would otherwise break formatting.
    logger.info('%s concordance summary: %s', name, pformat(summary))

    return sample_concordance_ht, sites_concordance_ht
def concordance_tables(full_vcf, downsample_dict, output, overwrite):
    """
    Run concordance between the full dataset and the downsampled one,
    pretty-print the global summary and write both result tables.

    :param full_vcf: left-hand input, passed straight to ``hl.concordance``
        (presumably a MatrixTable -- confirm against the caller)
    :param downsample_dict: right-hand input, passed straight to
        ``hl.concordance`` (presumably a MatrixTable despite the name --
        confirm against the caller)
    :param output: path prefix; ``'samples.ht'`` and ``'variants.ht'`` are
        appended verbatim, with no separator inserted
    :param overwrite: forwarded as ``overwrite=`` to both table writes
    :return: None
    """
    summary, per_sample, per_variant = hl.concordance(full_vcf, downsample_dict)
    pprint(summary)

    # Column (sample) table first, then row (variant) table.
    for table, suffix in ((per_sample, 'samples.ht'), (per_variant, 'variants.ht')):
        table.write(output + suffix, overwrite=overwrite)
Esempio n. 5
0
 def test_concordance_no_values_doesnt_error(self):
     """Concordance on a dataset with every row filtered out must not raise."""
     empty = get_dataset().filter_rows(False)
     _, per_col, per_row = hl.concordance(empty, empty)
     # Force evaluation of both result tables; the test passes if neither errors.
     for table in (per_col, per_row):
         table._force_count()
Esempio n. 6
0
    def test_concordance_n_discordant(self):
        """Check ``n_discordant`` and the concordance matrices.

        First compares a dataset with itself (no discordance expected), then
        compares two small hand-built matrix tables and checks the exact
        per-sample, global and per-variant results.
        """
        dataset = get_dataset()
        _, self_cols, _ = hl.concordance(dataset, dataset)
        n_samples_with_discordance = self_cols.aggregate(
            hl.agg.count_where(self_cols.n_discordant != 0))
        assert n_samples_with_discordance == 0

        def entry(position, sample, genotype):
            # One (variant, sample) record on contig '1' with alleles A/T.
            return hl.Struct(locus=hl.Locus('1', position),
                             alleles=['A', 'T'],
                             s=sample,
                             GT=genotype)

        def build_mt(entries):
            # Long-format records -> matrix table keyed by variant and sample.
            table = hl.Table.parallelize(
                entries,
                schema='struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
            return table.to_matrix_table(row_key=['locus', 'alleles'],
                                         col_key=['s'])

        # Left side: four samples at position 100, plus sample '1' at 101.
        left_entries = [
            entry(100, '1', hl.Call([0, 0])),
            entry(100, '2', hl.Call([0, 0])),
            entry(100, '3', hl.Call([1, 1])),
            entry(100, '4', hl.Call([1, 1])),
            entry(101, '1', hl.Call([1, 1])),
        ]
        # Right side: position 100 only; sample '1' has a missing genotype.
        right_entries = [
            entry(100, '1', None),
            entry(100, '2', hl.Call([0, 1])),
            entry(100, '3', hl.Call([0, 1])),
            entry(100, '4', hl.Call([1, 1])),
        ]

        left_mt = build_mt(left_entries)
        right_mt = build_mt(right_entries)
        global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(left_mt, right_mt)

        expected_cols = [
            hl.Struct(s='1',
                      concordance=[
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 1, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [1, 0, 0, 0, 0],
                      ],
                      n_discordant=0),
            hl.Struct(s='2',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                      ],
                      n_discordant=1),
            hl.Struct(s='3',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 0],
                      ],
                      n_discordant=1),
            hl.Struct(s='4',
                      concordance=[
                          [1, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 1],
                      ],
                      n_discordant=0),
        ]
        assert cols_conc_2.collect() == expected_cols

        expected_global = [
            [3, 0, 0, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 1, 0, 1, 0],
            [0, 0, 0, 0, 0],
            [1, 0, 0, 1, 1],
        ]
        assert global_conc_2 == expected_global

        expected_rows = [
            hl.Struct(locus=hl.Locus('1', 100),
                      alleles=['A', 'T'],
                      concordance=[
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 1, 0, 1, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 1, 1],
                      ],
                      n_discordant=2),
            hl.Struct(locus=hl.Locus('1', 101),
                      alleles=['A', 'T'],
                      concordance=[
                          [3, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0],
                          [1, 0, 0, 0, 0],
                      ],
                      n_discordant=0),
        ]
        assert rows_conc_2.collect() == expected_rows
Esempio n. 7
0
# Print the schema of the pipeline dataset.
pipeline_ds.describe()


# Restrict both datasets to the autosomes (contigs chr1 .. chr22).
autosomes_interval_expressions = [
    hl.parse_locus_interval('chr' + str(chrom), reference_genome=reference)
    for chrom in range(1, 23)
]

giab_auts_ds = hl.filter_intervals(giab_ds, autosomes_interval_expressions)
pipeline_auts_ds = hl.filter_intervals(pipeline_ds, autosomes_interval_expressions)

# Rebuild every genotype from its first two alleles as an unphased call so
# that phasing differences do not affect the comparison.
# NOTE(review): assumes diploid calls -- only GT[0] and GT[1] are kept.
giab_auts_ds = giab_auts_ds.annotate_entries(GT = hl.call(giab_auts_ds.GT[0], giab_auts_ds.GT[1], phased=False))
pipeline_auts_ds = pipeline_auts_ds.annotate_entries(GT = hl.call(pipeline_auts_ds.GT[0], pipeline_auts_ds.GT[1], phased=False))

# Run genotype concordance
global_conc, cols_conc, rows_conc = hl.concordance(giab_auts_ds, pipeline_auts_ds)

# summary[i][j]: i is the left (GIAB) state, j the right (pipeline) state.
# Indices used below: 1 = missing/no call, 2 = hom-ref, 3 = het, 4 = hom-var.
summary = global_conc

left_homref_right_homvar = summary[2][4]
left_het_right_missing = summary[3][1]
left_het_right_something_else = sum(summary[3]) - summary[3][3]
total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
# Everything in the 3x3 block of states 2..4 that is off the diagonal.
total_discordant = sum(sum(s[2:]) for s in summary[2:]) - total_concordant

# NOTE(review): raises ZeroDivisionError when the 3x3 block of states 2..4
# is entirely zero.
concordance = total_concordant / (total_concordant + total_discordant)

# datetime.utcnow() is deprecated since Python 3.12. This yields the same
# naive UTC timestamp, so any other use of `now` behaves as before.
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

# NOTE(review): month/day/year are not zero-padded, so distinct dates can
# collide (Jan 12 and Nov 2 of 2020 both give "1122020"). Left unchanged
# because the file name format may be relied on elsewhere.
results_bucket = "{}/validation/{}/validation-result-{}{}{}.txt".format(bucket_name, cohort_prefix, now.month, now.day, now.year)