Example #1
    def test_annotate_intervals(self):
        ds = get_dataset()

        bed1 = hl.import_bed(resource('example1.bed'), reference_genome='GRCh37')
        bed2 = hl.import_bed(resource('example2.bed'), reference_genome='GRCh37')
        bed3 = hl.import_bed(resource('example3.bed'), reference_genome='GRCh37')
        self.assertTrue(list(bed2.key.dtype) == ['interval'])
        self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

        interval_list1 = hl.import_locus_intervals(resource('exampleAnnotation1.interval_list'))
        interval_list2 = hl.import_locus_intervals(resource('exampleAnnotation2.interval_list'))
        self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
        self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

        ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        self.assertTrue(ann.all((ann.locus.position <= 14000000) |
                                (ann.locus.position >= 17000000) |
                                (hl.is_missing(ann.in_interval))))

        for bed in [bed2, bed3]:
            ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
            expr = (hl.case()
                    .when(ann.locus.position <= 14000000, ann.target == 'gene1')
                    .when(ann.locus.position >= 17000000, ann.target == 'gene2')
                    .default(ann.target == hl.null(hl.tstr)))
            self.assertTrue(ann.all(expr))

        self.assertTrue(ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
                        ._same(ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))

        self.assertTrue(ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
                        ._same(ds.annotate_rows(target=bed2[ds.locus].target).rows()))
Example #2
    def test_import_locus_intervals_badly_defined_intervals(self):
        interval_file = resource('example3.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37', skip_invalid_intervals=True)
        self.assertTrue(t.count() == 21)

        t = hl.import_locus_intervals(interval_file, reference_genome=None, skip_invalid_intervals=True)
        self.assertTrue(t.count() == 22)
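A hedged sketch of why the two counts differ: with reference_genome='GRCh37', skip_invalid_intervals=True drops intervals whose contig or coordinates are not valid for that genome, while reference_genome=None skips the validation entirely, so one extra badly defined interval survives. The file written below is hypothetical and only illustrates the mechanism.

import hail as hl

# Hypothetical two-line interval file: the second interval lies beyond the end of
# GRCh37 chromosome 1 (~249 Mb), so it only parses when no reference genome is given.
with open('/tmp/toy.interval_list', 'w') as f:
    f.write('1:100-200\n')
    f.write('1:500000000-500000001\n')

t37 = hl.import_locus_intervals('/tmp/toy.interval_list',
                                reference_genome='GRCh37',
                                skip_invalid_intervals=True)
t_none = hl.import_locus_intervals('/tmp/toy.interval_list',
                                   reference_genome=None,
                                   skip_invalid_intervals=True)
assert t37.count() == 1 and t_none.count() == 2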
Example #3
def annotate_in_segdup(mt, genome_version="GRCh38"):

    if genome_version == "GRCh37":
        segdup_regions = hl.import_locus_intervals("gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh37/GRCh37GenomicSuperDup.bed", reference_genome="GRCh37")
    elif genome_version == "GRCh38":
        segdup_regions = hl.import_locus_intervals("gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/GRCh38GenomicSuperDup.without_decoys.bed", reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.annotate_rows(info = mt.info.annotate(
        in_segdup=hl.is_defined(segdup_regions[mt.locus])))
Example #4
def annotate_in_LCR(mt, genome_version="GRCh38"):

    if genome_version == "GRCh37":
        lcr_regions = hl.import_locus_intervals("gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh37/grch37_LCRs_without_decoys.bed", reference_genome="GRCh37")
    elif genome_version == "GRCh38":
        lcr_regions = hl.import_locus_intervals("gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/grch38_LCRs_without_decoys.bed", reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.annotate_rows(info = mt.info.annotate(
        in_LCR=hl.is_defined(lcr_regions[mt.locus])))
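Both helpers above follow the same pattern: join an interval Table against mt.locus and record membership as a boolean INFO field. A minimal usage sketch, assuming a hypothetical GRCh38 VCF path and that both functions are in scope:

import hail as hl

mt = hl.import_vcf('gs://my-bucket/callset.vcf.bgz',   # hypothetical input
                   reference_genome='GRCh38',
                   force_bgz=True)
mt = annotate_in_segdup(mt, genome_version='GRCh38')
mt = annotate_in_LCR(mt, genome_version='GRCh38')

# Keep variants outside both segmental duplications and low-complexity regions.
mt = mt.filter_rows(~mt.info.in_segdup & ~mt.info.in_LCR)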
Example #5
    def test_import_locus_intervals_badly_defined_intervals(self):
        interval_file = resource('example3.interval_list')
        t = hl.import_locus_intervals(interval_file,
                                      reference_genome='GRCh37',
                                      skip_invalid_intervals=True)
        self.assertTrue(t.count() == 21)

        t = hl.import_locus_intervals(interval_file,
                                      reference_genome=None,
                                      skip_invalid_intervals=True)
        self.assertTrue(t.count() == 22)
Example #6
def run_platform_imputation(
    mt: hl.MatrixTable,
    plat_min_cluster_size: int,
    plat_min_sample_size: int,
    plat_assignment_pcs: int,
) -> hl.Table:
    """
    Run PCA using sample callrate across Broad's evaluation intervals and create Hail Table
    with platform PCs and assigned platform.
    :param MatrixTable mt: QC MatrixTable
    :param plat_min_cluster_size: Minimum cluster size for HDBSCAN clustering
    :param plat_min_sample_size: Minimum sample size for HDBSCAN clustering
    :param plat_assignment_pcs: Number of PCs used for HDBSCAN clustering
    :return: Table with platform PCs and assigned platform
    :rtype: Table
    """
    intervals = hl.import_locus_intervals(
        "gs://gcp-public-data--broad-references/hg38/v0/exome_evaluation_regions.v1.interval_list"
    )
    callrate_mt = compute_callrate_mt(mt, intervals)
    eigenvalues, scores_ht, ignore = run_platform_pca(callrate_mt)
    plat_ht = assign_platform_from_pcs(
        scores_ht,
        hdbscan_min_cluster_size=plat_min_cluster_size,
        hdbscan_min_samples=plat_min_sample_size,
    )
    plat_pcs = {
        f"plat_PC{i+1}": scores_ht.scores[i]
        for i in list(range(0, plat_assignment_pcs))
    }
    scores_ht = scores_ht.annotate(**plat_pcs).drop("scores")
    plat_ht = plat_ht.annotate(**scores_ht[plat_ht.key])
    return plat_ht
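A hypothetical invocation of the function above; the helpers it calls (compute_callrate_mt, run_platform_pca, assign_platform_from_pcs) come from the surrounding QC utilities, and the parameter values here are illustrative only.

platform_ht = run_platform_imputation(
    mt=qc_mt,                    # assumed, already hard-filtered QC MatrixTable
    plat_min_cluster_size=40,
    plat_min_sample_size=40,
    plat_assignment_pcs=9,
)
platform_ht.describe()
platform_ht.show(5)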
Example #7
def run_pipeline(args):
    hl.init(log='./hail_annotation_pipeline.log')
    '''
	rg = hl.get_reference('GRCh37')
	grch37_contigs = [x for x in rg.contigs if not x.startswith('GL') and not x.startswith('M')]
	contig_dict = dict(zip(grch37_contigs, ['chr'+x for x in grch37_contigs]))
	'''

    exome_intervals = hl.import_locus_intervals(
        '/gpfs/ycga/project/lek/shared/resources/hg38/exome_evaluation_regions.v1.interval_list',
        reference_genome='GRCh38')

    #mt = hl.import_vcf(args.vcf,reference_genome='GRCh38',contig_recoding=contig_dict,array_elements_required=False,force_bgz=True,filter='MONOALLELIC')
    mt = hl.import_vcf(args.vcf,
                       reference_genome='GRCh38',
                       array_elements_required=False,
                       force_bgz=True,
                       filter='MONOALLELIC')

    mt = mt.filter_rows(hl.is_defined(exome_intervals[mt.locus]))

    pprint.pprint(mt.describe())
    pprint.pprint(mt.show())

    mt = mt.repartition(hl.eval(hl.int(mt.n_partitions() / 10)))

    mt.write(args.out, overwrite=True)
Example #8
def filter_out_segdups(mt, genome_version="GRCh38"):

    if genome_version == "GRCh38":
        segdup_regions = hl.import_locus_intervals("gs://broad-dsp-spec-ops/scratch/weisburd/ref/GRCh38/GRCh38GenomicSuperDup.without_decoys.bed", reference_genome="GRCh38")
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")

    return mt.filter_rows(hl.is_missing(segdup_regions[mt.locus]))
Example #9
    def test_annotate_intervals(self):
        ds = get_dataset()

        bed1 = hl.import_bed(resource('example1.bed'),
                             reference_genome='GRCh37')
        bed2 = hl.import_bed(resource('example2.bed'),
                             reference_genome='GRCh37')
        bed3 = hl.import_bed(resource('example3.bed'),
                             reference_genome='GRCh37')
        self.assertTrue(list(bed2.key.dtype) == ['interval'])
        self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

        interval_list1 = hl.import_locus_intervals(
            resource('exampleAnnotation1.interval_list'))
        interval_list2 = hl.import_locus_intervals(
            resource('exampleAnnotation2.interval_list'))
        self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
        self.assertTrue(
            list(interval_list2.row.dtype) == ['interval', 'target'])

        ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        self.assertTrue(
            ann.all((ann.locus.position <= 14000000)
                    | (ann.locus.position >= 17000000)
                    | (hl.is_missing(ann.in_interval))))

        for bed in [bed2, bed3]:
            ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
            expr = (hl.case().when(ann.locus.position <= 14000000,
                                   ann.target == 'gene1').when(
                                       ann.locus.position >= 17000000,
                                       ann.target == 'gene2').default(
                                           ann.target == hl.null(hl.tstr)))
            self.assertTrue(ann.all(expr))

        self.assertTrue(
            ds.annotate_rows(
                in_interval=interval_list1[ds.locus]).rows()._same(
                    ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))

        self.assertTrue(
            ds.annotate_rows(
                target=interval_list2[ds.locus].target).rows()._same(
                    ds.annotate_rows(target=bed2[ds.locus].target).rows()))
Example #10
def get_lcr_ht(overwrite: bool = False) -> hl.Table:
    lcr_interval = hl.import_locus_intervals(
        f'{nfs_dir}/resources/grch38/LCRFromHengHg38.txt.bgz',
        skip_invalid_intervals=True,
        min_partitions=50,
        reference_genome='GRCh38')
    return lcr_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/LCRFromHengHg38.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
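Because Table.checkpoint() returns the Table it writes (or, with _read_if_exists=True, the previously written copy), the result can be joined immediately. A short sketch, with mt standing in for an assumed GRCh38-keyed MatrixTable:

lcr_ht = get_lcr_ht()
mt = mt.annotate_rows(in_lcr=hl.is_defined(lcr_ht[mt.locus]))
mt = mt.filter_rows(~mt.in_lcr)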
Example #11
def filter_low_conf_regions(
    mt: Union[hl.MatrixTable, hl.Table],
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filters low-confidence regions

    :param mt: MatrixTable or Table to filter
    :param filter_lcr: Whether to filter LCR regions
    :param filter_decoy: Whether to filter decoy regions
    :param filter_segdup: Whether to filter Segdup regions
    :param filter_exome_low_coverage_regions: Whether to filter exome low confidence regions
    :param high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
    :return: MatrixTable or Table with low confidence regions removed
    """
    build = get_reference_genome(mt.locus).name
    if build == "GRCh37":
        import gnomad.resources.grch37.reference_data as resources
    elif build == "GRCh38":
        import gnomad.resources.grch38.reference_data as resources

    criteria = []
    if filter_lcr:
        lcr = resources.lcr_intervals.ht()
        criteria.append(hl.is_missing(lcr[mt.locus]))

    if filter_decoy:
        decoy = resources.decoy_intervals.ht()
        criteria.append(hl.is_missing(decoy[mt.locus]))

    if filter_segdup:
        segdup = resources.seg_dup_intervals.ht()
        criteria.append(hl.is_missing(segdup[mt.locus]))

    if filter_exome_low_coverage_regions:
        high_cov = resources.high_coverage_intervals.ht()
        criteria.append(hl.is_missing(high_cov[mt.locus]))

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            criteria.append(hl.is_defined(region[mt.locus]))

    if criteria:
        filter_criteria = functools.reduce(operator.iand, criteria)
        if isinstance(mt, hl.MatrixTable):
            mt = mt.filter_rows(filter_criteria)
        else:
            mt = mt.filter(filter_criteria)

    return mt
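A minimal usage sketch; the MatrixTable path and high-confidence interval list are hypothetical. Since the criteria are combined with a logical AND (functools.reduce(operator.iand, ...)), a surviving row must lie outside every filtered region set and inside every high-confidence region set.

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/raw.mt')   # hypothetical input
mt = filter_low_conf_regions(
    mt,
    filter_lcr=True,
    filter_decoy=True,
    filter_segdup=True,
    high_conf_regions=['gs://my-bucket/high_conf.interval_list'],  # hypothetical path
)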
Example #12
    def test_import_locus_intervals(self):
        interval_file = resource('annotinterall.interval_list')
        intervals = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
        nint = intervals.count()

        i = 0
        with open(interval_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nint, i)

        self.assertEqual(intervals.interval.dtype.point_type, hl.tlocus('GRCh37'))
Example #13
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome='GRCh37')
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    if not rg37.has_liftover('GRCh38'):
        rg37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    p5k = p5k.annotate(start=hl.liftover(p5k.interval.start, 'GRCh38'),
                       end=hl.liftover(p5k.interval.start, 'GRCh38'))
    p5k = p5k.filter((p5k.start.contig == 'chr' + p5k.interval.start.contig)
                     & (p5k.end.contig == 'chr' + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by('locus')
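The returned Table is keyed by the lifted-over GRCh38 locus and keeps the original b37 locus as locus_b37, so it can be joined directly against a GRCh38-keyed dataset. A hedged sketch with a hypothetical path and an assumed MatrixTable mt:

p5k = _import_purcell_5k('gs://my-bucket/purcell5k.interval_list')   # hypothetical path
mt = mt.filter_rows(hl.is_defined(p5k[mt.locus]))                    # mt assumed keyed by a GRCh38 locus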
Example #14
def filter_low_conf_regions(
        mt: hl.MatrixTable,
        filter_lcr: bool = True,
        filter_decoy: bool = True,
        filter_segdup: bool = True,
        high_conf_regions: Optional[List[str]] = None) -> hl.MatrixTable:
    """
    Filters low-confidence regions

    :param MatrixTable mt: MT to filter
    :param bool filter_lcr: Whether to filter LCR regions
    :param bool filter_decoy: Whether to filter decoy regions
    :param bool filter_segdup: Whether to filter Segdup regions
    :param list of str high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
    :return: MT with low confidence regions removed
    :rtype: MatrixTable
    """
    from gnomad_hail.resources import lcr_intervals_path, decoy_intervals_path, segdup_intervals_path

    if filter_lcr:
        lcr = hl.import_locus_intervals(lcr_intervals_path)
        mt = mt.filter_rows(hl.is_defined(lcr[mt.locus]), keep=False)

    if filter_decoy:
        decoy = hl.import_bed(decoy_intervals_path)
        mt = mt.filter_rows(hl.is_defined(decoy[mt.locus]), keep=False)

    if filter_segdup:
        segdup = hl.import_bed(segdup_intervals_path)
        mt = mt.filter_rows(hl.is_defined(segdup[mt.locus]), keep=False)

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            mt = mt.filter_rows(hl.is_defined(region[mt.locus]), keep=True)

    return mt
Example #15
    def test_import_locus_intervals(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
        nint = t.count()

        i = 0
        with open(interval_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nint, i)
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

        tmp_file = new_temp_file(prefix="test", suffix="interval_list")
        start = t.interval.start
        end = t.interval.end
        (t
         .key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
         .select()
         .export(tmp_file, header=False))

        t2 = hl.import_locus_intervals(tmp_file)

        self.assertTrue(t.select()._same(t2))
Example #16
def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover("references/grch37_to_grch38.over.chain.gz", rg38)
    p5k = p5k.annotate(
        start=hl.liftover(p5k.interval.start, "GRCh38"),
        end=hl.liftover(p5k.interval.start, "GRCh38"),
    )
    p5k = p5k.filter((p5k.start.contig == "chr" + p5k.interval.start.contig)
                     & (p5k.end.contig == "chr" + p5k.interval.end.contig))
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by("locus")
Example #17
    def test_import_locus_intervals(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
        nint = t.count()

        i = 0
        with open(interval_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nint, i)
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

        tmp_file = new_temp_file(prefix="test", suffix="interval_list")
        start = t.interval.start
        end = t.interval.end
        (t.key_by(
            interval=hl.locus_interval(start.contig, start.position, end.
                                       position, True, True)).select().export(
                                           tmp_file, header=False))

        t2 = hl.import_locus_intervals(tmp_file)

        self.assertTrue(t.select()._same(t2))
Example #18
    def test_skat(self):
        ds2 = hl.import_vcf(resource('sample2.vcf'))

        covariatesSkat = (hl.import_table(resource("skat.cov"), impute=True)
            .key_by("Sample"))

        phenotypesSkat = (hl.import_table(resource("skat.pheno"),
                                          types={"Pheno": hl.tfloat64},
                                          missing="0")
            .key_by("Sample"))

        intervalsSkat = (hl.import_locus_intervals(resource("skat.interval_list")))

        weightsSkat = (hl.import_table(resource("skat.weights"),
                                       types={"locus": hl.tlocus(),
                                              "weight": hl.tfloat64})
            .key_by("locus"))

        ds = hl.split_multi_hts(ds2)
        ds = ds.annotate_rows(gene=intervalsSkat[ds.locus],
                              weight=weightsSkat[ds.locus].weight)
        ds = ds.annotate_cols(pheno=phenotypesSkat[ds.s].Pheno,
                              cov=covariatesSkat[ds.s])
        ds = ds.annotate_cols(pheno=hl.cond(ds.pheno == 1.0,
                                            False,
                                            hl.cond(ds.pheno == 2.0,
                                                    True,
                                                    hl.null(hl.tbool))))

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=ds.GT.n_alt_alleles(),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=False).count()

        hl.skat(ds,
                key_expr=ds.gene,
                weight_expr=ds.weight,
                y=ds.pheno,
                x=hl.pl_dosage(ds.PL),
                covariates=[ds.cov.Cov1, ds.cov.Cov2],
                logistic=True).count()
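Reusing the exact call from the test above, the Table returned by hl.skat has one row per key (here, per gene interval) carrying the group size, the SKAT Q statistic, the p-value and a fault flag, so burden results can be sorted and inspected directly. A hedged sketch:

skat_ht = hl.skat(ds,
                  key_expr=ds.gene,
                  weight_expr=ds.weight,
                  y=ds.pheno,
                  x=ds.GT.n_alt_alleles(),
                  covariates=[ds.cov.Cov1, ds.cov.Cov2],
                  logistic=False)
skat_ht.order_by(skat_ht.p_value).show(10)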
Example #19
def main(args):
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636, 21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        scores.write(exome_callrate_scores_ht_path)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:])
            )
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
Example #20
def write_truth_concordance(data_type: str, truth_sample: str,
                            overwrite: bool) -> None:
    sample_mapping = {
        'NA12878': {
            'exomes': 'C1975::NA12878',
            'genomes': 'G94982_NA12878'
        },
        'syndip': {
            'exomes': 'CHMI_CHMI3_Nex1',
            'genomes': 'CHMI_CHMI3_WGS1'
        }
    }

    mt = get_qc_samples_filtered_gnomad_data(data_type, autosomes_only=False)
    mt = mt.filter_cols(mt.s == sample_mapping[truth_sample][data_type])
    mt = mt.annotate_entries(GT=unphase_call_expr(mt.GT))
    mt = mt.key_cols_by(s=hl.str(truth_sample))
    mt = mt.repartition(1000 if data_type == 'genomes' else 100, shuffle=False)

    truth_mt = hl.read_matrix_table(NA12878_mt_path() if truth_sample ==
                                    'NA12878' else syndip_mt_path())
    truth_mt = truth_mt.key_cols_by(s=hl.str(truth_sample))
    if data_type == 'exomes':
        exome_calling_intervals = hl.import_locus_intervals(
            exome_calling_intervals_path, skip_invalid_intervals=True)
        truth_mt = truth_mt.filter_rows(
            hl.is_defined(exome_calling_intervals[truth_mt.locus]))
    truth_mt = hl.split_multi_hts(truth_mt, left_aligned=False)
    truth_mt = truth_mt.annotate_entries(GT=unphase_call_expr(truth_mt.GT))

    sample_concordance_ht, sites_concordance_ht = compute_concordance(
        mt, truth_mt, name=truth_sample)
    sites_concordance_ht.write(annotations_ht_path(
        data_type, f'{truth_sample}_concordance'),
                               overwrite=overwrite)
    sample_concordance_ht.write(sample_annotations_table_path(
        data_type, f'{truth_sample}_concordance'),
                                overwrite=overwrite)
Example #21
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):

    #Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')

    #Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)

    #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2

    hl.init()
    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")

        cond1.wait()

    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":
        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        #Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:

            #Extract INFO fields

            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            data_sr = data_filtered.select_rows(
                data_filtered.info.medianDepthAll,
                data_filtered.info.medianDepthNonMiss,
                data_filtered.info.medianGQ, data_filtered.info.missingness,
                data_filtered.info.completeGTRatio, data_filtered.info.ABratio,
                data_filtered.info.MendelSite, data_filtered.info.AN,
                data_filtered.info.AC, data_filtered.info.AC_Hom,
                data_filtered.info.AC_Het)

            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/\[//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName +
                      "_INFO.tsv | grep -v locus " + " >> " + outputDir +
                      "/INFO_" + chrName)
            os.system("rm " + inputDir + "/" + fileParts)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

            #raise Exception
        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()

        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()
    time.sleep(300)
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
Example #22
def write_omes_concordance(data_type: str, dup_version: str, by_platform: bool,
                           overwrite: bool) -> None:
    other_data_type = 'exomes' if data_type == 'genomes' else 'genomes'

    mt = get_qc_samples_filtered_gnomad_data(data_type)
    other_mt = get_qc_samples_filtered_gnomad_data(other_data_type)

    dup_ht = hl.import_table(
        genomes_exomes_duplicate_ids_tsv_path(dup_version), impute=True)
    dup_ht = dup_ht.filter(dup_ht.dup_pair_rank == 0)

    # Unify sample names based on inferred duplicates (from pc_relate)
    mt = mt.filter_cols(hl.is_defined(dup_ht.key_by(f'{data_type}_s')[mt.s]))
    mt = mt.annotate_entries(GT=unphase_call_expr(mt.GT))
    other_mt = other_mt.key_cols_by(
        s=dup_ht.key_by(f'{other_data_type}_s')[other_mt.s][f'{data_type}_s'])
    other_mt = other_mt.filter_cols(hl.is_defined(other_mt.s))
    other_mt = other_mt.annotate_entries(GT=unphase_call_expr(other_mt.GT))

    exome_calling_intervals = hl.import_locus_intervals(
        exome_calling_intervals_path, skip_invalid_intervals=True)
    mt = mt.filter_rows(hl.is_defined(exome_calling_intervals[mt.locus]))
    other_mt = other_mt.filter_rows(
        hl.is_defined(exome_calling_intervals[other_mt.locus]))

    if by_platform:
        omes_conc = hl.read_table(
            annotations_ht_path(data_type, "omes_concordance"))
        omes_conc = omes_conc.transmute(concordance=[
            hl.struct(concordance_matrix=omes_conc.concordance,
                      n_discordant=omes_conc.n_discordant,
                      meta={
                          'data_type': 'omes',
                          'platform': 'all'
                      })
        ])
        platforms = mt.aggregate_cols(
            hl.agg.collect_as_set(mt.meta.qc_platform))
        logger.info(
            "Computing concordance by platform for platforms: {}".format(
                ",".join([str(x) for x in platforms])))

        for platform in platforms:
            plat_mt = mt.filter_cols(mt.meta.qc_platform == platform)
            _, sites_concordance_ht = compute_concordance(
                plat_mt, other_mt, name=f'omes (platform: {platform})')
            omes_conc = omes_conc.annotate(
                concordance=omes_conc.concordance.append(
                    hl.struct(concordance_matrix=sites_concordance_ht[
                        omes_conc.key].concordance,
                              n_discordant=sites_concordance_ht[
                                  omes_conc.key].n_discordant,
                              meta={
                                  'data_type': 'omes',
                                  'platform': hl.str(platform)
                              })))
        omes_conc.write(annotations_ht_path(data_type,
                                            "omes_by_platform_concordance"),
                        overwrite=overwrite)

    else:
        sample_concordance_ht, sites_concordance_ht = compute_concordance(
            mt, other_mt, name='omes')
        sites_concordance_ht.write(annotations_ht_path(data_type,
                                                       "omes_concordance"),
                                   overwrite=overwrite)
        sample_concordance_ht.write(sample_annotations_table_path(
            data_type, "omes_concordance"),
                                    overwrite=overwrite)
Example #23
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """

    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as they cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus])
            & hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metrics_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(global_counts=hl.struct(
            snvs=metric_snvs, indels=metrics_indels),
                                 counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank', (ht.global_rank + 1) /
                hl.cond(ht.snv, ht.globals.global_counts.snvs,
                        ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank', (ht.rank + 1) / hl.cond(
                    ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0)
                                  & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
Example #24
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship = lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden = burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight = hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
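A hedged sketch of the per-gene burden aggregation this dataset is set up for, assuming data/gene.interval_list uses the five-column format so the joined gene struct carries a target name; the aggregation itself is illustrative rather than part of the original example.

burden_mt = (burden_ds
             .group_rows_by(gene_name=burden_ds.gene.target)   # assumes a 'target' column in gene.interval_list
             .aggregate(burden=hl.agg.sum(burden_ds.weight * burden_ds.GT.n_alt_alleles())))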
Example #25
    global_field_1=5,
    global_field_2=10,
    pli={
        'SCN1A': 0.999,
        'SONIC': 0.014
    },
    populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(
    hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(
    use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv',
                            key='Sample',
                            impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(
    weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #26
def get_lcr_intervals() -> hl.Table:
    return hl.import_locus_intervals('gs://gnomad-public/resources/grch38/LCRFromHengHg38.txt', reference_genome='GRCh38', skip_invalid_intervals=True)
Example #27
pprint(n_6_multi)

# 02_prefilter_variants.py
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter.keep.variant_list'
INITIAL_VARIANT_QC_FILE  = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_metrics.tsv'

# Read in the target intervals
TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'
# Read in the padded target intervals (50bp padding)
PADDED_TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_padded_targets.interval_list'

# Low complexity regions in the data.
LCRs = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/LCR-hs38.bed'

# Import the interval lists for the target intervals.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS, reference_genome='GRCh38')
# Import the interval lists for the padded target intervals.
padded_target_intervals = hl.import_locus_intervals(PADDED_TARGET_INTERVALS, reference_genome='GRCh38')
# Import the interval lists for the LCRs.
LCR_intervals = hl.import_locus_intervals(LCRs, reference_genome='GRCh38')

# Annotate variants with flag indicating if they are in LCR or failed VQSR.
mt = mt.annotate_rows(fail_VQSR = hl.len(mt.filters) != 0)
mt = mt.annotate_rows(in_LCR = hl.is_defined(LCR_intervals[mt.locus]))
mt = mt.annotate_rows(not_in_target_intervals = ~hl.is_defined(target_intervals[mt.locus]))
mt = mt.annotate_rows(not_in_padded_target_intervals = ~hl.is_defined(padded_target_intervals[mt.locus]))

# Get information about the number of variants that were excluded.
fail_VQSR = mt.filter_rows(mt.fail_VQSR).count_rows()
in_LCR = mt.filter_rows(mt.in_LCR).count_rows()
not_in_target_intervals = mt.filter_rows(mt.not_in_target_intervals).count_rows()
Example #28
    def test_import_locus_intervals_no_reference_specified(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome=None)
        self.assertTrue(t.count() == 2)
        self.assertEqual(t.interval.dtype.point_type, hl.tstruct(contig=hl.tstr, position=hl.tint32))
Example #29
def create_binned_data(ht: hl.Table, data: str, data_type: str,
                       n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton
    containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                        'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}"
    )
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                                'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1)
                & (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
Example #30
    def test_import_locus_intervals_no_reference_specified(self):
        interval_file = resource('annotinterall.interval_list')
        t = hl.import_locus_intervals(interval_file, reference_genome=None)
        self.assertEqual(t.interval.dtype.point_type, hl.tstruct(contig=hl.tstr, position=hl.tint32))
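With reference_genome=None, the interval point type is a struct of contig and position rather than a locus, as the assertion above shows. A hedged sketch of converting such a table back to GRCh37 loci afterwards, assuming every contig and position in the file is valid for that genome (interval_file as in the test above):

t = hl.import_locus_intervals(interval_file, reference_genome=None)
t = t.key_by()  # unkey so the interval field can be rebuilt
t = t.transmute(interval=hl.interval(
    hl.locus(t.interval.start.contig, t.interval.start.position, reference_genome='GRCh37'),
    hl.locus(t.interval.end.contig, t.interval.end.position, reference_genome='GRCh37'),
    includes_start=t.interval.includes_start,
    includes_end=t.interval.includes_end))
t = t.key_by('interval')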
Example #31
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Example #32
logreg_pop_file = 'gs://ccdg-qc-multi/data/IBD/logreg_results_'
samples_file = 'gs://ccdg-qc-multi/data/IBD/samples_file.tsv'
plink_file = 'gs://ccdg-qc-multi/data/IBD/ibdvars'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.read_matrix_table(vds_splitmulti_file)
vds_immune = hl.read_matrix_table(vds_immune_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# filter variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

ibd_vars = hl.import_locus_intervals(ibdvars_file)
intervallist = [x.interval for x in ibd_vars.collect()]
manuelvars = hl.import_locus_intervals(manuelvars_file)
intervallist = intervallist + [x.interval for x in manuelvars.collect()]

vds = hl.filter_intervals(vds, intervallist, keep=True)
##vds = vds.filter_rows(vds.qcpass)

vds_immune = hl.filter_intervals(vds_immune, intervallist, keep=True)
vds_immune = hl.split_multi_hts(
    vds_immune.select_entries(vds_immune.GT, vds_immune.AD, vds_immune.DP,
                              vds_immune.GQ, vds_immune.PL))
vds_combined = vds_immune.union_cols(vds)

vds_combined = vds_combined.naive_coalesce(1000)
vds_combined.write(filtered_vds_combined_file, overwrite=True)
Example #33
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT.hardcalls.mt'
# Read in the hard calls matrix table.
mt = hl.read_matrix_table(MT_HARDCALLS)

# Read in the target intervals
TARGET_INTERVALS = 'gs://dalio_bipolar_w1_w2_hail_02/whole_exome_illumina_coding_v1.Homo_sapiens_assembly19.targets.fixed.interval_list'

# Low complexity regions in the data.
LCRs = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/inputs_low_complexity_regions_b37.interval_list'

INITIAL_VARIANT_QC_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_metrics_b37_callset.tsv'
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_b37_callset.keep.variant_list'
INITIAL_VARIANT_AUTO_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter_b37_callset.keep.autosome.variant_list'

# Import the interval lists for the LCRs.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS)
LCR_intervals = hl.import_locus_intervals(LCRs)

# Annotate variants with flag indicating if they are in LCR or failed VQSR.
mt = mt.annotate_rows(fail_VQSR=hl.len(mt.filters) != 0)
mt = mt.annotate_rows(in_LCR=hl.is_defined(LCR_intervals[mt.locus]))
mt = mt.annotate_rows(
    not_in_target_intervals=~hl.is_defined(target_intervals[mt.locus]))

# Get information about the number of variants that were excluded.
fail_VQSR = mt.filter_rows(mt.fail_VQSR).count_rows()
in_LCR = mt.filter_rows(mt.in_LCR).count_rows()
not_in_target_intervals = mt.filter_rows(
    mt.not_in_target_intervals).count_rows()

print('n variants failing VQSR:')
Example #34
vds_1kg_file = 'gs://ccdg-qc-multi/data/1000genomes/vds/hail2_ALL.GRCh38.genotypes.20170504.vds'
mhc_chr8inv_file = 'gs://ccdg-qc-multi/data/MHC_invchr8_longLDreg_liftover_to_GRCh38.txt'
rel_exclusion_file = 'gs://ccdg-qc-multi/out/king/' + chrom + '/ibd_greater_0884_' + chrom + '.txt'
samples_to_keep_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/01_sample_qc_keep.txt'

# output
pca_value_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_values.tsv'
pca_score_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_scores.tsv'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## interval list
#mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True).key_by('f0')
mhc_chr8inv = hl.import_locus_intervals(mhc_chr8inv_file)

##
rel_exclusion = hl.import_table(rel_exclusion_file,
                                no_header=True).key_by('f0')

vds = hl.read_matrix_table(vds_ldpruned_common_file)
onekg = hl.read_matrix_table(vds_1kg_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# rename samples
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print('annotate 1KG...')
onekg = onekg.annotate_cols(s='1KG_' + onekg.s)
Example #35
HIGH_LD_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2/inputs/b38_high_ld.bed'
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
INITIAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/02_prefilter.keep.variant_list'

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_initial_variants = hl.import_table(INITIAL_VARIANT_LIST,
                                      types={
                                          'locus':
                                          hl.tlocus(reference_genome='GRCh38'),
                                          'alleles':
                                          hl.tarray(hl.tstr)
                                      })
ht_initial_variants = ht_initial_variants.key_by(ht_initial_variants.locus,
                                                 ht_initial_variants.alleles)

high_LD_intervals = hl.import_locus_intervals(HIGH_LD_INTERVALS,
                                              reference_genome='GRCh38')

mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.annotate_rows(in_high_LD=hl.is_defined(high_LD_intervals[mt.locus]))

mt = mt.filter_rows(
    hl.is_defined(ht_initial_variants[mt.row_key]) & (~mt.in_high_LD))
mt = mt.filter_rows(mt.locus.in_x_nonpar() | mt.locus.in_autosome_or_par())
mt = hl.variant_qc(mt, name='qc')
mt = mt.filter_rows((mt.qc.AF[0] > 0.01) & (mt.qc.AF[0] < 0.99)
                    & ((mt.qc.call_rate > 0.98) | mt.locus.in_x_nonpar()
                       | mt.locus.in_x_par())).persist()

mt.count()