def min_rep(locus, ref, alt):
    """Build a Hail expression holding the minimal representation of a ref/alt pair.

    :param locus: locus expression of the variant
    :param ref: reference allele expression
    :param alt: alternate allele expression
    :return: struct expression with fields ``ref`` and ``alt``. When ``alt`` equals
        ``'<NON_REF>'`` the struct holds the first character of ``ref`` and ``alt`` unchanged;
        otherwise it holds the alleles produced by ``hl.min_rep``. If ``hl.min_rep`` yields a
        locus different from ``locus`` (and ``alt`` is not ``'<NON_REF>'``), the expression
        errors with "locus before and after minrep differ".
    """
    minimized = hl.min_rep(locus, [ref, alt])

    # The order of the branches matters: the '<NON_REF>' check is consulted
    # before the locus comparison.
    chooser = hl.case()
    chooser = chooser.when(
        alt == '<NON_REF>',
        hl.struct(ref=ref[0:1], alt=alt),
    )
    chooser = chooser.when(
        locus == minimized.locus,
        hl.struct(ref=minimized.alleles[0], alt=minimized.alleles[1]),
    )
    return chooser.or_error("locus before and after minrep differ")
def import_vcf(vcf_path: str,
               genome_version: str,
               min_partitions: int = None,
               force_bgz: bool = True,
               drop_samples: bool = False,
               skip_invalid_loci: bool = False,
               split_multi_alleles: bool = True):
    """Read a VCF into a Hail MatrixTable aligned to the requested GRCh build.

    Contig names are recoded so that "chr"-prefixed and un-prefixed names both map onto
    the names used by the reference. The result carries the globals ``sourceFilePath`` and
    ``genomeVersion`` and the row field ``original_alt_alleles`` (set only for rows with more
    than two alleles).

    :param str vcf_path: path of the VCF to import
    :param str genome_version: "37" or "38"
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic rows and re-key rows on the
        output of ``hl.min_rep``
    :return: the imported MatrixTable
    :raises ValueError: if ``genome_version`` is neither "37" nor "38"
    """
    supported_versions = ("37", "38")
    if genome_version not in supported_versions:
        raise ValueError(f"Invalid genome_version: {genome_version}")

    logger.info(f"\n==> import vcf: {vcf_path}")

    genome_name = f"GRCh{genome_version}"
    ref = hl.get_reference(genome_name)

    # add (or remove) "chr" prefix from vcf chroms so they match the reference.
    # Two passes keep the same key order / override order as merging two dicts.
    contig_recoding = {}
    for contig in ref.contigs:
        if "chr" in contig:
            contig_recoding[contig.replace("chr", "")] = contig
    for contig in ref.contigs:
        if "chr" not in contig:
            contig_recoding[f"chr{contig}"] = contig

    mt = hl.import_vcf(
        vcf_path,
        reference_genome=genome_name,
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
    )

    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)

    # Remember the pre-split variant ids, but only for multi-allelic rows.
    is_multi_allelic = hl.len(mt.alleles) > 2
    variant_ids = get_expr_for_variant_ids(mt.locus, mt.alleles)
    mt = mt.annotate_rows(original_alt_alleles=hl.or_missing(is_multi_allelic, variant_ids))

    if not split_multi_alleles:
        return mt

    mt = hl.split_multi_hts(mt)
    mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))
    return mt
def _primary_contigs(genome_version: str) -> set:
    """Return the names of chromosomes 1-22, X, Y and the mitochondrion for a GRCh build."""
    chromosomes = [str(number) for number in range(1, 23)] + ["X", "Y"]
    if genome_version == "38":
        # GRCh38 contigs are "chr"-prefixed and the mitochondrial contig is "chrM".
        return {f"chr{name}" for name in chromosomes} | {"chrM"}
    return set(chromosomes) | {"MT"}


def import_vcf(
        vcf_path: Union[str, List[str]],
        genome_version: str,
        sample_name: str,
        # Exomes VCFs can be split to ~1mb chunks for annotation (good for joins)
        # but they are pretty tiny and I think too big is bad.
        min_partitions: int = 50,
        force_bgz: bool = False,
        drop_samples: bool = False,
        skip_invalid_loci: bool = False,
        split_multi_alleles: bool = True,
        checkpoint_path: str = "/vep/tmpck1.mt"):
    """Import vcf and return MatrixTable.

    Rows are restricted to chromosomes 1-22, X, Y and the mitochondrial contig, the filtered
    table is checkpointed to ``checkpoint_path``, and the globals ``sourceFilePath`` /
    ``genomeVersion`` plus the row fields ``originalAltAlleles``, ``xpos`` and ``ref`` are added.

    :param vcf_path: path (or list of paths) of the VCF(s) to import
    :param str genome_version: "37" or "38"
    :param str sample_name: accepted for interface compatibility; not used by this function
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome.
    :param bool split_multi_alleles: if True, split multi-allelic rows and re-key rows on the
        output of ``hl.min_rep``
    :param str checkpoint_path: where the filtered MatrixTable is written (overwriting any
        existing data) and read back from
    :return: the imported MatrixTable
    :raises ValueError: if ``genome_version`` is neither "37" nor "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    genome_name = f"GRCh{genome_version}"

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(genome_name)
    contig_recoding = {
        **{ref_contig.replace("chr", ""): ref_contig for ref_contig in ref.contigs if "chr" in ref_contig},
        **{f"chr{ref_contig}": ref_contig for ref_contig in ref.contigs if "chr" not in ref_contig},
    }

    mt = hl.import_vcf(
        vcf_path,
        reference_genome=genome_name,
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
        array_elements_required=False)

    # After import the contigs use the reference's naming, so the allow-list must
    # match the build: a list of un-prefixed names would discard every GRCh38 row.
    valid_contigs = hl.literal(_primary_contigs(genome_version))
    mt = mt.filter_rows(valid_contigs.contains(mt.locus.contig))

    # Materialise the filtered table and continue from the on-disk copy.
    mt.write(checkpoint_path, overwrite=True)
    mt = hl.read_matrix_table(checkpoint_path)

    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)

    # All three row annotations derive only from locus/alleles, so one pass suffices.
    mt = mt.annotate_rows(
        originalAltAlleles=hl.or_missing(
            hl.len(mt.alleles) > 2,
            get_expr_for_variant_ids(mt.locus, mt.alleles)),
        xpos=get_expr_for_xpos(mt.locus),
        ref=mt.alleles[0],
    )

    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))

    return mt
def struct_from_min_rep(i):
    """Build the split-row struct for alternate allele ``i`` of ``ds``.

    ``hl.min_rep`` is applied to the reference allele paired with allele ``i``. If the
    resulting locus equals ``ds.locus`` the expression yields a struct with ``locus``,
    the two minimized ``alleles``, ``a_index=i`` and ``was_split=True``; otherwise it errors.

    :param i: index into ``ds.alleles`` of the alternate allele
    :return: Hail struct expression for the split row
    """
    def build_row(minimized):
        split_row = hl.struct(
            locus=ds.locus,
            alleles=[minimized.alleles[0], minimized.alleles[1]],
            a_index=i,
            was_split=True,
        )
        chooser = hl.case().when(ds.locus == minimized.locus, split_row)
        return chooser.or_error("Found non-left-aligned variant in sparse_split_multi")

    biallelic = [ds.alleles[0], ds.alleles[i]]
    return hl.bind(build_row, hl.min_rep(ds.locus, biallelic))
def struct_from_min_rep(i):
    """Build the split-row struct for alternate allele ``i`` of ``ds``.

    ``hl.min_rep`` is applied to the reference allele paired with allele ``i``:

    * if the resulting locus equals ``ds.locus``, the expression yields a struct with
      ``locus``, the two minimized ``alleles``, ``a_index=i`` and ``was_split=True``;
    * otherwise, if ``filter_changed_loci`` holds, it yields a missing struct of that type;
    * otherwise it errors with a message listing the old and minimized locus and alleles.

    :param i: index into ``ds.alleles`` of the alternate allele
    :return: Hail struct expression for the split row
    """
    def build_row(mr):
        split_row = hl.struct(
            locus=ds.locus,
            alleles=[mr.alleles[0], mr.alleles[1]],
            a_index=i,
            was_split=True,
        )
        # Typed missing value used when rows whose locus moved are to be dropped.
        missing_row = hl.null(
            hl.tstruct(
                locus=ds.locus.dtype,
                alleles=hl.tarray(hl.tstr),
                a_index=hl.tint,
                was_split=hl.tbool,
            )
        )
        mismatch_message = (
            "Found non-left-aligned variant in sparse_split_multi\n"
            + "old locus: " + hl.str(ds.locus) + "\n"
            + "old ref : " + ds.alleles[0] + "\n"
            + "old alt : " + ds.alleles[i] + "\n"
            + "mr locus : " + hl.str(mr.locus) + "\n"
            + "mr ref : " + mr.alleles[0] + "\n"
            + "mr alt : " + mr.alleles[1]
        )

        chooser = hl.case()
        chooser = chooser.when(ds.locus == mr.locus, split_row)
        chooser = chooser.when(filter_changed_loci, missing_row)
        return chooser.or_error(mismatch_message)

    biallelic = [ds.alleles[0], ds.alleles[i]]
    return hl.bind(build_row, hl.min_rep(ds.locus, biallelic))
def test_min_rep(self):
    """Smoke-check ``hl.min_rep`` on the fixture dataset; nothing about the result is asserted."""
    # FIXME actually test
    hl.min_rep(self.get_dataset()).count()
def create_cnn_rank_file() -> None:
    """
    Build the ranked CNN score Table for genomes and write it to ``score_ranking_path('genomes', 'cnn')``.

    The CNN score TSV is first imported into an intermediate Table unless that Table's
    ``_SUCCESS`` marker already exists. Scores are then split per alternate allele, min-repped,
    joined with the gnomAD annotations, restricted to ``n_nonref > 0`` and ranked.

    :return: Nothing
    :rtype: None
    """
    scores_tsv_path = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.bgz'
    scores_ht_path = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht'
    scores_ht_marker = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht/_SUCCESS'

    logger.info("Creating CNN rank file.")

    already_imported = hl.utils.hadoop_exists(scores_ht_marker)
    if not already_imported:
        logger.info(f"Importing CNN scores")
        raw_scores = hl.import_table(scores_tsv_path, min_partitions=1000, impute=True)
        raw_scores.write(scores_ht_path, overwrite=True)

    logger.info('Formatting CNN scores...')
    ht = hl.read_table(scores_ht_path)

    # Turn the comma-separated Alt column into a list and emit one row per alt allele.
    ht = ht.annotate(
        alt_alleles=ht.Alt.split(','),
        was_split=ht.Alt.contains(','),
    )
    ht = ht.explode('alt_alleles')

    # Minrep
    ht = ht.annotate(locus=hl.locus(hl.str(ht.Contig), ht.Pos))
    minimized = hl.min_rep(ht.locus, [ht.Ref, ht.alt_alleles])
    ht = ht.annotate(**minimized)

    ht = ht.annotate(vartype=add_variant_type(ht.alleles))
    ht = ht.key_by('locus', 'alleles')

    # Select relevant annotations and add singleton / n_nonref, was_split
    gnomad_ht = get_gnomad_annotations('genomes')
    ht = ht.select(
        **gnomad_ht[ht.key],
        variant_type=ht.vartype.variant_type,
        n_alt_alleles=ht.vartype.n_alt_alleles,
        score=ht.CNN_1D_Score,
    )
    ht = ht.filter(ht.n_nonref > 0)

    not_split = ~ht.was_split
    has_adj_ac = ht.ac > 0
    subranks = {
        'singleton_rank': ht.singleton,
        'biallelic_rank': not_split,
        'biallelic_singleton_rank': not_split & ht.singleton,
        'adj_rank': has_adj_ac,
        'adj_biallelic_rank': not_split & has_adj_ac,
        'adj_singleton_rank': ht.singleton & has_adj_ac,
        'adj_biallelic_singleton_rank': not_split & ht.singleton & has_adj_ac,
    }
    ht = add_rank(ht, score_expr=-1 * ht.score, subrank_expr=subranks)

    ht.write(score_ranking_path('genomes', 'cnn'), overwrite=True)
# Script: import the scattered gnomAD exome VCFs, keep only the samples flagged
# `cloudable` in the metadata table, attach the `info` field from the VQSR
# sites-only VCF, run min_rep, and write the result to `out_vds_path`.
import hail as hl

# NOTE(review): `sc` is not defined in this script; presumably it is the
# SparkContext provided by the shell this is run from — confirm.
hl.init(sc=sc)

# Note: all files were moved to HDFS on spark-c002 to speed along imports (now at 18 hours on 100 cores)
meta_path = 'gnomad_meta/metadata_import_table_intermediate_oct_09_2017.txt'
vcfs_path = 'gnomad_exomes/scattered/*/genotypes.unfiltered.vcf.gz'
out_vds_path = 'gnomad.vds'
vqsr_file = 'gnomad_meta/ExAC.merged.sites_only.vcf.ICfiltered.recalibrated.vcf.bgz'

# Sample metadata, keyed by the `sample` column so it can be looked up by column key below.
meta_kt = hl.import_table(meta_path, impute=True).key_by('sample')
vqsr_vds = hl.import_vcf(vqsr_file)

# The input files end in .vcf.gz; force_bgz=True makes Hail read them as block-gzipped.
# The header is taken from a separate file rather than from the scattered shards.
vds = hl.import_vcf(vcfs_path, call_fields=['GT', 'PGT'], force_bgz=True,
                    header_file='gnomad_exomes/gnomad_exomes_header_fixed.vcf')

# Keep only columns whose metadata row has a true `cloudable` value.
vds = vds.filter_cols(meta_kt[vds.s].cloudable)

# Copy the `info` row field over from the VQSR dataset, joined on (locus, alleles).
vds = vds.annotate_rows(info=vqsr_vds[(vds.locus, vds.alleles), :].info)

# NOTE(review): this passes the whole dataset to hl.min_rep with `left_aligned=True`;
# confirm that the installed Hail version supports this call form.
vds = hl.min_rep(vds, left_aligned=True)
vds.write(out_vds_path)

# To copy to cloud:
# cd /humgen/atgu1/fs03/konradk/gnomad/2.1/hail
# hadoop distcp -libjars gcs-connector-latest-hadoop2.jar -conf core-site.xml -m 50 hdfs:///user/konradk/gnomad.vds gs://gnomad/raw/hail-0.2/vds/exomes/gnomad.exomes.vds
def compute_variant_id(alt):
    """Return the variant id expression for the reference allele paired with ``alt``.

    The pair ``[alleles[0], alt]`` at ``locus`` is passed through ``hl.min_rep`` and the
    resulting locus and alleles are handed to ``variant_id`` together with ``max_length``.

    :param alt: alternate allele expression
    :return: whatever ``variant_id`` produces for the minimized variant
    """
    def id_for(minimized):
        return variant_id(minimized.locus, minimized.alleles, max_length)

    minimized_variant = hl.min_rep(locus, [alleles[0], alt])
    return hl.rbind(minimized_variant, id_for)