Example no. 1
    def test_annotate_intervals(self):
        ds = get_dataset()

        bed1 = hl.import_bed(resource('example1.bed'), reference_genome='GRCh37')
        bed2 = hl.import_bed(resource('example2.bed'), reference_genome='GRCh37')
        bed3 = hl.import_bed(resource('example3.bed'), reference_genome='GRCh37')
        self.assertTrue(list(bed2.key.dtype) == ['interval'])
        self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

        interval_list1 = hl.import_locus_intervals(resource('exampleAnnotation1.interval_list'))
        interval_list2 = hl.import_locus_intervals(resource('exampleAnnotation2.interval_list'))
        self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
        self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

        ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        self.assertTrue(ann.all((ann.locus.position <= 14000000) |
                                (ann.locus.position >= 17000000) |
                                (hl.is_missing(ann.in_interval))))

        for bed in [bed2, bed3]:
            ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
            expr = (hl.case()
                    .when(ann.locus.position <= 14000000, ann.target == 'gene1')
                    .when(ann.locus.position >= 17000000, ann.target == 'gene2')
                    .default(ann.target == hl.null(hl.tstr)))
            self.assertTrue(ann.all(expr))

        self.assertTrue(ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
                        ._same(ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))

        self.assertTrue(ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
                        ._same(ds.annotate_rows(target=bed2[ds.locus].target).rows()))
Example no. 2
    def test_import_bed_badly_defined_intervals(self):
        bed_file = resource('example4.bed')
        t = hl.import_bed(bed_file, reference_genome='GRCh37', skip_invalid_intervals=True)
        self.assertTrue(t.count() == 3)

        t = hl.import_bed(bed_file, reference_genome=None, skip_invalid_intervals=True)
        self.assertTrue(t.count() == 4)
Example no. 3
    def test_import_bed_badly_defined_intervals(self):
        bed_file = resource('example4.bed')
        t = hl.import_bed(bed_file,
                          reference_genome='GRCh37',
                          skip_invalid_intervals=True)
        self.assertTrue(t.count() == 3)

        t = hl.import_bed(bed_file,
                          reference_genome=None,
                          skip_invalid_intervals=True)
        self.assertTrue(t.count() == 4)
Example no. 4
def get_cnt_matrix(mnv_table, region="ALL", dist=1, minimum_cnt=0, PASS=True, part_size=1000, hom=False):
    # mnv_table = hail table of mnvs
    # region = bed file, defining the regions of interest (e.g. enhancer region)
    # dist = distance between two SNPs
    # PASS=True: restrict to MNVs where both constituent variants are PASS
    # indels are no longer considered
    # filter by region, if you give a bed file path as region
    if region != "ALL":
        bed = hl.import_bed(region)
        mnv_table = mnv_table.filter(hl.is_defined(bed[mnv_table.locus]))
    if PASS == "NO":  # exclusively keep MNVs where at least one variant is non-PASS
        mnv_table = mnv_table.filter((mnv_table.filters.length() > 0) | (mnv_table.prev_filters.length() > 0))
    elif PASS == True:
        mnv_table = mnv_table.filter((mnv_table.filters.length() == 0) & (mnv_table.prev_filters.length() == 0))
    if hom:
        mnv_table = mnv_table.filter(mnv_table.n_homhom > 0)
    # count MNV occurrences -- restricting to SNPs
    mnv = mnv_table.filter((mnv_table.alleles[0].length() == 1) &
                           (mnv_table.alleles[1].length() == 1) &
                           (mnv_table.prev_alleles[0].length() == 1) &
                           (mnv_table.prev_alleles[1].length() == 1) &
                           ((mnv_table.locus.position - mnv_table.prev_locus.position) == dist))  # filter to that specific distance

    #repartition to proper size
    mnv = mnv.repartition(part_size)

    mnv_cnt = mnv.group_by("alleles", "prev_alleles").aggregate(cnt=agg.count())  # count occurrences
    mnv_cnt = mnv_cnt.annotate(
        refs=mnv_cnt.prev_alleles[0] + "N" * (dist - 1) + mnv_cnt.alleles[0])  # annotate combined refs
    mnv_cnt = mnv_cnt.annotate(
        alts=mnv_cnt.prev_alleles[1] + "N" * (dist - 1) + mnv_cnt.alleles[1])  # annotate combined alts

    if minimum_cnt > 0: mnv_cnt = mnv_cnt.filter((mnv_cnt.cnt > minimum_cnt))  # remove trivial ones
    return (mnv_cnt.select("refs", "alts", "cnt"))
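A minimal usage sketch for the helper above; all paths are placeholders, and `agg` is assumed to be the Hail aggregator module bound under that bare name, as the function body implies:

import hail as hl

agg = hl.agg  # the helper above calls agg.count() under this bare name
hl.init()

mnv_ht = hl.read_table("gs://my-bucket/mnv_table.ht")  # hypothetical MNV Hail Table
enhancer_counts = get_cnt_matrix(mnv_ht,
                                 region="gs://my-bucket/enhancers.bed",  # hypothetical enhancer BED
                                 dist=1,
                                 minimum_cnt=10,
                                 PASS=True)
enhancer_counts.show()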
Example no. 5
def get_cnt_matrix_alldist(mnv_table, region="ALL", dist_min=1, dist_max=10, minimum_cnt=0, PASS=True, part_size=1000):
    # give a distance range instead of a single distance
    if region != "ALL":
        bed = hl.import_bed(region, skip_invalid_intervals=True)
        mnv_table = mnv_table.filter(hl.is_defined(bed[mnv_table.locus]))
    if PASS:
        mnv_table = mnv_table.filter((mnv_table.filters.length() == 0) & (mnv_table.prev_filters.length() == 0))

    # count MNV occurrences -- restricting to SNPs
    mnv_table = mnv_table.filter((mnv_table.alleles[0].length() == 1) &
                                 (mnv_table.alleles[1].length() == 1) &
                                 (mnv_table.prev_alleles[0].length() == 1) &
                                 (mnv_table.prev_alleles[1].length() == 1))
    pdall = {}
    for dist in range(dist_min, (dist_max+1)):
        mnv = mnv_table.filter((mnv_table.locus.position - mnv_table.prev_locus.position) == dist)  # filter to that specific distance

        #repartition to proper size
        mnv = mnv.repartition(part_size)

        mnv_cnt = mnv.group_by("alleles", "prev_alleles").aggregate(cnt=agg.count())  # count occurrences
        mnv_cnt = mnv_cnt.annotate(
            refs=mnv_cnt.prev_alleles[0] + "N" * (dist - 1) + mnv_cnt.alleles[0])  # annotate combined refs
        mnv_cnt = mnv_cnt.annotate(
            alts=mnv_cnt.prev_alleles[1] + "N" * (dist - 1) + mnv_cnt.alleles[1])  # annotate combined alts

        if minimum_cnt > 0: mnv_cnt = mnv_cnt.filter((mnv_cnt.cnt > minimum_cnt))  # remove trivial ones
        pdall[dist] = ht_cnt_mat_to_pd(mnv_cnt.select("refs", "alts", "cnt")) #saving as pandas dataframe, in dictionary
        print ("done d={0}".format(dist))
        print(tm.ctime())
    return (pdall) #returning a dictionary of dataframe
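The distance-range variant can be driven the same way; a brief sketch under the same placeholder-path assumptions (it also relies on `ht_cnt_mat_to_pd` and a `tm` time alias being defined in the source module):

mnv_ht = hl.read_table("gs://my-bucket/mnv_table.ht")  # hypothetical MNV Hail Table
cnt_by_dist = get_cnt_matrix_alldist(mnv_ht, region="ALL", dist_min=1, dist_max=5)
for d, df in cnt_by_dist.items():  # dict of pandas DataFrames keyed by distance
    print(d, df.shape)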
Example no. 6
    def test_annotate_intervals(self):
        ds = get_dataset()

        bed1 = hl.import_bed(resource('example1.bed'),
                             reference_genome='GRCh37')
        bed2 = hl.import_bed(resource('example2.bed'),
                             reference_genome='GRCh37')
        bed3 = hl.import_bed(resource('example3.bed'),
                             reference_genome='GRCh37')
        self.assertTrue(list(bed2.key.dtype) == ['interval'])
        self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

        interval_list1 = hl.import_locus_intervals(
            resource('exampleAnnotation1.interval_list'))
        interval_list2 = hl.import_locus_intervals(
            resource('exampleAnnotation2.interval_list'))
        self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
        self.assertTrue(
            list(interval_list2.row.dtype) == ['interval', 'target'])

        ann = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
        self.assertTrue(
            ann.all((ann.locus.position <= 14000000)
                    | (ann.locus.position >= 17000000)
                    | (hl.is_missing(ann.in_interval))))

        for bed in [bed2, bed3]:
            ann = ds.annotate_rows(target=bed[ds.locus].target).rows()
            expr = (hl.case().when(ann.locus.position <= 14000000,
                                   ann.target == 'gene1').when(
                                       ann.locus.position >= 17000000,
                                       ann.target == 'gene2').default(
                                           ann.target == hl.null(hl.tstr)))
            self.assertTrue(ann.all(expr))

        self.assertTrue(
            ds.annotate_rows(
                in_interval=interval_list1[ds.locus]).rows()._same(
                    ds.annotate_rows(in_interval=bed1[ds.locus]).rows()))

        self.assertTrue(
            ds.annotate_rows(
                target=interval_list2[ds.locus].target).rows()._same(
                    ds.annotate_rows(target=bed2[ds.locus].target).rows()))
Example no. 7
def overlap_with_file(mt: hl.MatrixTable, bed) -> hl.MatrixTable:
    '''
    Filter a MatrixTable to variants overlapping the baits file.

    :param mt: a MatrixTable
    :param bed: the baits BED file with the coordinates used to filter the MatrixTable
    :return: a MatrixTable containing only variants that overlap the baits file
    '''
    baits = hl.import_bed(bed, reference_genome='GRCh38')
    overlapping_mt = mt.filter_rows(hl.is_defined(baits[mt.locus]))
    return overlapping_mt
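A usage sketch for the helper above, assuming a placeholder MatrixTable path and a hypothetical GRCh38 baits BED file:

mt = hl.read_matrix_table("gs://my-bucket/cohort.mt")  # placeholder path
mt_on_target = overlap_with_file(mt, "gs://my-bucket/baits_GRCh38.bed")  # hypothetical baits BED
print(mt_on_target.count_rows())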
Example no. 8
def get_telomeres_and_centromeres_ht(overwrite: bool = False) -> hl.Table:
    tc_interval = hl.import_bed(
        f'{nfs_dir}/resources/grch38/hg38.telomeresAndMergedCentromeres.bed',
        skip_invalid_intervals=True,
        min_partitions=10,
        reference_genome='GRCh38')
    return tc_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/hg38.telomeresAndMergedCentromeres.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
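The cached interval table can then be joined on locus like the other interval tables in these examples; a sketch, assuming `nfs_dir` is defined as in the snippet and using a placeholder MatrixTable:

tc_ht = get_telomeres_and_centromeres_ht()
mt = hl.read_matrix_table("gs://my-bucket/cohort.mt")  # placeholder path
mt = mt.filter_rows(hl.is_defined(tc_ht[mt.locus]), keep=False)  # drop telomeric/centromeric variants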
Example no. 9
def get_segdups_ht(overwrite: bool = False) -> hl.Table:
    segdup_interval = hl.import_bed(
        f'{nfs_dir}/resources/grch38/GRCh38_segdups.bed',
        skip_invalid_intervals=True,
        min_partitions=50,
        reference_genome='GRCh38')
    return segdup_interval.checkpoint(
        f'{nfs_dir}/resources/grch38/GRCh38_segdups.ht',
        overwrite=overwrite,
        _read_if_exists=not overwrite)
Example no. 10
def import_intervals_from_bed(bed_path: str, platform_label: str,
                              genome_ref: str) -> hl.Table:
    """
    Handle importing BED files as intervals. Recode contig if necessary and
    annotate global meta-info.
    Note: `platform_label` and `genome_ref` are required, since these info
           will be used as global annotations.

    :param bed_path: Path to capture interval BED file
    :param platform_label: Unique capture interval identifier (e.g. 'ssv3')
    :param genome_ref: Either 'GRCh37' or 'GRCh38

    :return: HailTable keyed by interval
    """

    # genome references
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')

    # dict contig recode from rg38 -> rg37.
    # only autosomal and sex chromosomes
    CONTIG_RECODING_HG38_TO_HG37 = {
        contig: contig.replace('chr', '')
        for contig in rg38.contigs[:24]
    }

    # dict contig recode from rg37 -> rg38.
    # only autosomal and sex chromosomes
    CONTIG_RECODING_HG37_TO_HG38 = {
        CONTIG_RECODING_HG38_TO_HG37.get(k): k
        for k in CONTIG_RECODING_HG38_TO_HG37.keys()
    }

    # Recode contigs if the chromosome names in the BED file do not match the genome reference.
    if genome_ref == 'GRCh37':
        contig_recoding = CONTIG_RECODING_HG38_TO_HG37
    elif genome_ref == 'GRCh38':
        contig_recoding = CONTIG_RECODING_HG37_TO_HG38
    else:
        contig_recoding = None

    ht_intervals = hl.import_bed(bed_path,
                                 reference_genome=genome_ref,
                                 contig_recoding=contig_recoding)

    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), bed_path, genome_ref, platform_label)))

    ht_intervals = (ht_intervals.annotate_globals(
        **global_ann_expr).key_by('interval').repartition(100))
    return ht_intervals
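A hedged example call; the BED path and platform label are hypothetical, and `GLOBAL_ANNOTATION_FIELDS` and `current_date` are assumed to be defined in the same module, as the function body implies:

ht = import_intervals_from_bed("gs://my-bucket/ssv3_capture.bed",  # hypothetical capture BED
                               platform_label="ssv3",
                               genome_ref="GRCh38")
ht.describe()
print(hl.eval(ht.globals))  # inspect the global annotations written by the helper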
Example no. 11
    def test_import_bed(self):
        bed_file = resource('example1.bed')
        bed = hl.import_bed(bed_file, reference_genome='GRCh37')

        nbed = bed.count()
        i = 0
        with open(bed_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    try:
                        int(line.split()[0])
                        i += 1
                    except:
                        pass
        self.assertEqual(nbed, i)

        self.assertEqual(bed.interval.dtype.point_type, hl.tlocus('GRCh37'))

        bed_file = resource('example2.bed')
        t = hl.import_bed(bed_file, reference_genome='GRCh37')
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))
        self.assertTrue(list(t.key.dtype) == ['interval'])
        self.assertTrue(list(t.row.dtype) == ['interval', 'target'])
Example no. 12
    def test_import_bed(self):
        bed_file = resource('example1.bed')
        bed = hl.import_bed(bed_file, reference_genome='GRCh37')

        nbed = bed.count()
        i = 0
        with open(bed_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    try:
                        int(line.split()[0])
                        i += 1
                    except:
                        pass
        self.assertEqual(nbed, i)

        self.assertEqual(bed.interval.dtype.point_type, hl.tlocus('GRCh37'))

        bed_file = resource('example2.bed')
        t = hl.import_bed(bed_file, reference_genome='GRCh37')
        self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))
        self.assertTrue(list(t.key.dtype) == ['interval'])
        self.assertTrue(list(t.row.dtype) == ['interval', 'target'])
Example no. 13
def filter_low_conf_regions(
        mt: hl.MatrixTable,
        filter_lcr: bool = True,
        filter_decoy: bool = True,
        filter_segdup: bool = True,
        high_conf_regions: Optional[List[str]] = None) -> hl.MatrixTable:
    """
    Filters low-confidence regions

    :param MatrixTable mt: MT to filter
    :param bool filter_lcr: Whether to filter LCR regions
    :param bool filter_decoy: Whether to filter decoy regions
    :param bool filter_segdup: Whether to filter Segdup regions
    :param list of str high_conf_regions: Paths to set of high confidence regions to restrict to (union of regions)
    :return: MT with low confidence regions removed
    :rtype: MatrixTable
    """
    from gnomad_hail.resources import lcr_intervals_path, decoy_intervals_path, segdup_intervals_path

    if filter_lcr:
        lcr = hl.import_locus_intervals(lcr_intervals_path)
        mt = mt.filter_rows(hl.is_defined(lcr[mt.locus]), keep=False)

    if filter_decoy:
        decoy = hl.import_bed(decoy_intervals_path)
        mt = mt.filter_rows(hl.is_defined(decoy[mt.locus]), keep=False)

    if filter_segdup:
        segdup = hl.import_bed(segdup_intervals_path)
        mt = mt.filter_rows(hl.is_defined(segdup[mt.locus]), keep=False)

    if high_conf_regions is not None:
        for region in high_conf_regions:
            region = hl.import_locus_intervals(region)
            mt = mt.filter_rows(hl.is_defined(region[mt.locus]), keep=True)

    return mt
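A sketch of a typical call, assuming the gnomad_hail resource paths resolve and using a placeholder input MatrixTable; the optional high-confidence restriction is skipped:

mt = hl.read_matrix_table("gs://my-bucket/raw_genotypes.mt")  # placeholder path
mt = filter_low_conf_regions(mt,
                             filter_lcr=True,
                             filter_decoy=True,
                             filter_segdup=True,
                             high_conf_regions=None)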
Example no. 14
if __name__ == "__main__":
    # need to create the Spark cluster before initialising Hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = os.path.join(os.environ["HAIL_HOME"], "tmp")
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment
    # you may use your own here, from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb.tsv", delimiter="\t").key_by('s')

    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_pca_scores.mt")
    # mt = mt.annotate_cols(
    #    loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")

    pca_scores = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/pca_scores_known_pop.ht")
    pca_loadings = hl.read_table(f"{temp_dir}/ddd-elgh-ukbb/pca_loadings.ht")
    logger.info("assign population pcs")
   # population_assignment_table = assign_population_pcs(
Example no. 15
variant_list_file = 'gs://rcstorage/qced/' + chrom + '/qced_' + chrom + '_variant_list.txt'

# define output files
sample_qc_info_postqc_file = 'gs://rcstorage/qced/' + chrom + '/sample_qc_info_postqc_revisegt.txt'

print("importing vds files...")
vds = hl.read_matrix_table(vds_splitmulti_file)
num0 = vds.count()
print(num0)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Remove LCR
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("removing lcr...")
lcr = hl.import_bed(lcr_file, reference_genome='GRCh38')
vds = vds.filter_rows(hl.is_defined(lcr[vds.locus]), keep=False)
num1 = vds.count()
print(num1)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Annotate variants with PASS or FAIL
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("annotating variants...")

variant_list = hl.import_table(variant_list_file)

variant_list_post = variant_list.add_index()
variant_list_post = variant_list_post.key_by('idx')
vds_post = vds.add_row_index()
Example no. 16
    def test_import_bed_no_reference_specified(self):
        bed_file = resource('example1.bed')
        t = hl.import_bed(bed_file, reference_genome=None)
        self.assertTrue(t.count() == 3)
        self.assertEqual(t.interval.dtype.point_type, hl.tstruct(contig=hl.tstr, position=hl.tint32))
Example no. 17
        'locus').distinct_by_row().key_rows_by('locus', 'alleles')
    mt_split = hl.split_multi_hts(mt_annotated,
                                  keep_star=False,
                                  left_aligned=False)

    mt = mt_split.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                    "INDEL", "Other"))))

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",
        overwrite=True)
    print("Finished splitting and writing mt. ")

    agilent_table = hl.import_bed(agilent, reference_genome='GRCh38')
    mt_agilent = mt.filter_rows(hl.is_defined(agilent_table[mt.locus]))

    mt_agilent = hl.sample_qc(mt_agilent, name='sample_QC_Hail')
    pandadf1 = mt_agilent.cols().flatten()
    print("Outputting table of sample qc")
    pandadf1.export(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_agilent_sampleQC.tsv.bgz",
        header=True)

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_annotated.mt",
        overwrite=True)
Example no. 18
        types={
            "Locus": "locus<GRCh38>",
            "VQSLOD": hl.tfloat64
        })
    VQSLOD_indels = hl.import_table(
        f'{lustre_dir}/intervalwgs-qc/VQSLOD_indels.bgz',
        types={
            "Locus": "locus<GRCh38>",
            "VQSLOD": hl.tfloat64
        })
    sample_QC_nonHail = hl.import_table(
        f'{lustre_dir}/intervalwgs-qc/INTERVAL_WGS_Sample_QC_04-09-2019.txt',
        impute=True)

    centromere_table = hl.import_bed(
        f'{lustre_dir}/intervalwgs-qc/Centromere_region_UCSC_GRCh38.bed',
        reference_genome='GRCh38',
        min_partitions=250)

    #####################################################################
    ###################### INPUT DATA  ##############################
    #####################################################################
    # Give chromosome as input to program with chr prefix i.e chr1, chr2, chr3 etc
    CHROMOSOME = "chr1"
    print(f"Reading {CHROMOSOME} mt")

    mt = hl.read_matrix_table(f'{lustre_dir}/chr1.mt')
    #mt = hl.read_matrix_table(f"{temp_dir}/matrixtables/{CHROMOSOME}.mt")

    print("Splitting mt and writing out split mt")
    mt_split = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
Example no. 19
def get_baselevel_expression_for_genes(
        mt,
        gtex,
        gene_list=None,
        get_proportions=None,
        gene_maximums_ht_path=gtex_v7_gene_maximums_ht_path):
    gtex_table = gtex.key_by("transcript_id")

    if gene_list:
        genes = hl.literal(gene_list)

        # Filter context_ht to genes of interest
        mt = mt.annotate_rows(in_gene_of_interest=genes.find(
            lambda x: mt.vep.transcript_consequences.any(lambda tc: tc.
                                                         gene_symbol == x)))
        mt = mt.filter_rows(mt.in_gene_of_interest != "NA")

    # Need to modify process consequences to ignore splice variants, because these can occur in intronic regions

    all_coding_minus_splice = list(
        set(all_coding_csqs) - set([
            'splice_acceptor_variant', 'splice_donor_variant',
            'splice_region_variant'
        ]))

    def add_most_severe_consequence_to_consequence_minus_splice(
            tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
        """
        Copied from gnomad_hail but slight change
        """

        csqs = hl.literal(all_coding_minus_splice)
        return tc.annotate(most_severe_consequence=csqs.find(
            lambda c: tc.consequence_terms.contains(c)))

    # Add worst consequence within transcript consequences
    mt = (mt.annotate_rows(vep=mt.vep.annotate(
        transcript_consequences=mt.vep.transcript_consequences.map(
            add_most_severe_consequence_to_consequence_minus_splice))))

    # Explode on transcript consequences
    mt = mt.explode_rows(mt.vep.transcript_consequences)
    mt_kt = mt.rows()

    # Filter to positions in the CDS regions
    cds_intervals = hl.import_bed(
        "gs://gnomad-public/papers/2019-tx-annotation/data/other_data/gencode.v19.CDS.Hail.021519.bed"
    )
    mt_kt = mt_kt.annotate(in_cds=hl.is_defined(cds_intervals[mt_kt.locus]))
    mt_kt = mt_kt.filter(mt_kt.in_cds)

    # Filter to protein coding transcripts only
    mt_kt = mt_kt.filter(
        mt_kt.vep.transcript_consequences.biotype == "protein_coding")

    # Filter to coding variants to only evaluate those effects
    mt_kt = filter_table_to_csqs(mt_kt, all_coding_minus_splice)

    # To avoid double counting transcripts at a given base, key by transcript and position and dedup
    mt_kt = mt_kt.key_by(mt_kt.locus,
                         mt_kt.vep.transcript_consequences.transcript_id)
    mt_kt = mt_kt.distinct()

    # Annotate mt with the gtex values (ie. join them)
    mt_kt = mt_kt.annotate(
        tx_data=gtex_table[mt_kt.vep.transcript_consequences.transcript_id])

    ## Group by gene, symbol and position
    ht_sum_of_bases = mt_kt.group_by(
        locus=mt_kt.locus,
        ensg=mt_kt.vep.transcript_consequences.gene_id,
        symbol=mt_kt.vep.transcript_consequences.gene_symbol).aggregate(
            sum_per_base=hl.agg.array_sum(mt_kt.tx_data.agg_expression))

    tissue_ids = sorted([
        y.tissue.replace("-", "_").replace(" ",
                                           "_").replace("(",
                                                        "_").replace(")", "_")
        for y in gtex.values.take(1)[0]
    ])
    d = {tiss: i for i, tiss in enumerate(tissue_ids)}

    ht_sum_of_bases = ht_sum_of_bases.annotate(**{
        tissue: ht_sum_of_bases.sum_per_base[d[tissue]]
        for tissue in tissue_ids
    })

    if get_proportions:
        gene_maximums_ht = hl.read_table(gene_maximums_ht_path)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.annotate(alleles="filler")
        ht_sum_of_bases = get_expression_proportion(
            tx_table=ht_sum_of_bases,
            tissues_to_filter=["sum_per_base"],
            gene_maximum_ht=gene_maximums_ht)
        ht_sum_of_bases = ht_sum_of_bases.key_by(ht_sum_of_bases.locus)
        ht_sum_of_bases = ht_sum_of_bases.drop(ht_sum_of_bases.alleles)

    return ht_sum_of_bases
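A hypothetical invocation of the helper above; the context MatrixTable and GTEx summary table paths are placeholders, and the helpers it calls (`filter_table_to_csqs`, `get_expression_proportion`, `all_coding_csqs`, the gene-maximums table path) are assumed to be importable from the same module:

context_mt = hl.read_matrix_table("gs://my-bucket/context_vep.mt")  # placeholder path
gtex = hl.read_table("gs://my-bucket/gtex_v7_tx_summary.ht")        # placeholder path
base_level_ht = get_baselevel_expression_for_genes(context_mt,
                                                   gtex,
                                                   gene_list=["TTN"],
                                                   get_proportions=False)
base_level_ht.show()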
Example no. 20
    mt = mt_split.annotate_rows(
        Variant_Type=hl.cond((hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
                             hl.cond(
            hl.is_insertion(
                mt_split.alleles[0], mt_split.alleles[1]),
            "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0],
                                   mt_split.alleles[1]), "INDEL",
                    "Other"))))

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",  overwrite=True)
    print("Finished splitting and writing mt. ")

    intersection_table = hl.import_bed(
        intersection_bed, reference_genome='GRCh38')
    
    union_table = hl.import_bed(union_bed, reference_genome='GRCh38')
    
    mt_intersection = mt.filter_rows(
        hl.is_defined(intersection_table[mt.locus]))
    mt_union = mt.filter_rows(hl.is_defined(union_table[mt.locus]))

    mt_intersection = hl.sample_qc(mt_intersection, name='sample_QC_Hail')
    pandadf1 = mt_intersection.cols().flatten()
    print("Outputting table of sample qc")
    pandadf1.export(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_intersection_BED_sampleQC.tsv.bgz", header=True)

    mt_intersection = mt_intersection.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-intersection_BED.mt", overwrite=True)
Example no. 21
                         impute=True,
                         types={
                             'f0': hl.tstr
                         }).key_by('f0'))
vds = vds.annotate_cols(**table[vds.s])
# import covar
# dic = {}
# for i in np.arange(1, 41):
#     dic['pc' + str(i)] = hl.tfloat
# pcas = hl.import_table('gs://ukb_testdata/data/covar.txt', delimiter=' ', types=dic).key_by('FID')
# pcas = pcas.drop('IID')
# vds = vds.annotate_cols(**pcas[vds.s])
# vds.select_cols('f1')

print("current time is: ", time.asctime(time.localtime(time.time())))
bed = hl.import_bed('gs://ukb_testdata/data/Berisa.EUR.hg19_modif.bed')
print("current time is: ", time.asctime(time.localtime(time.time())))

vds = vds.annotate_rows(LD_block=bed[vds.locus].target)

gts_as_rows = vds.annotate_rows(
    mean=hl.agg.mean(hl.float(vds.GT.n_alt_alleles())),
    genotypes=hl.agg.collect(hl.float(vds.GT.n_alt_alleles())),
    phenotypes=hl.agg.collect(hl.float(vds.f1))).rows()

groups = gts_as_rows.group_by(ld_block=gts_as_rows.LD_block).aggregate(
    genotypes=hl.agg.collect(gts_as_rows.genotypes),
    ys=hl.agg.collect(gts_as_rows.phenotypes))

df = groups.to_spark()
Example no. 22
    def test_import_bed_no_reference_specified(self):
        bed_file = resource('example1.bed')
        t = hl.import_bed(bed_file, reference_genome=None)
        self.assertEqual(t.interval.dtype.point_type, hl.tstruct(contig=hl.tstr, position=hl.tint32))
Example no. 23
def main(args):

    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')

    # # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)
    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:
    # overlap_1kg_AKT
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    #pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # remove pruned areas that need to be removed

    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)
    # pruned_mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")

    # related_samples_to_drop = hl.read_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #    pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(
    #    loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt",
              'w') as f:
        for val in pca_evals:
            f.write(str(val) + "\n")

    logger.info("assign population pcs")

    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")