def main(args):
    """Lift input variant data to the other reference build and report SNP reference mismatches.

    The input is either a gnomAD public release Table (--gnomad with --exomes/--genomes)
    or an arbitrary Table/MatrixTable path (--ht/--mt). Results are written via lift_data;
    mismatch statistics are only logged.
    """
    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None
        # NOTE(review): if neither --exomes nor --genomes is given, data_type stays
        # unbound and the log call below raises NameError — original behavior kept.
        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'
        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))
    else:
        gnomad = False
        data_type = None
        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    # Split multiallelics only when the data does not carry the 'was_split' marker row field.
    if 'was_split' not in t.row:
        if isinstance(t, hl.Table):
            t = hl.split_multi(t)
        else:
            t = hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        # GRCh38 contig names carry the 'chr' prefix; GRCh37 names do not.
        contig = 'chr21' if source.name == 'GRCh38' else '21'
        test_interval = hl.parse_locus_interval(contig, reference_genome=source.name)
        t = hl.filter_intervals(t, [test_interval])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    # check_mismatch operates on a Table; reduce a MatrixTable to its rows first.
    rows = t if isinstance(t, hl.Table) else t.rows()
    mismatch = check_mismatch(rows)
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
def _import_dbsnp(**kwargs) -> hl.Table:
    """Import the dbSNP sites VCF, split multiallelics, and collapse duplicate
    (locus, alleles) rows into a single row with a set of rsIDs.

    :param kwargs: Keyword arguments forwarded to import_sites_vcf.
    :return: Table keyed by (locus, alleles) with an rsid set field.
    """
    ht = import_sites_vcf(**kwargs)
    # permit_shuffle: the dbSNP VCF contains duplicate loci (merged into a set
    # below), so rows may be out of order after splitting.
    ht = hl.split_multi(ht, permit_shuffle=True)
    grouped = ht.group_by(ht.locus, ht.alleles)
    return grouped.aggregate(rsid=hl.agg.collect_as_set(ht.rsid))
def split_info() -> hl.Table:
    """
    Build a bi-allelic info table by splitting the multi-allelic info table.

    Per-allele info fields and AS_lowqual are re-indexed to the split allele
    via a_index.

    :return: Info table with split multi-allelics
    :rtype: Table
    """
    ht = get_info(split=False).ht()

    # Split multiallelic rows, then pull out the annotation values that
    # correspond to each row's allele index.
    ht = hl.split_multi(ht)
    split_fields = split_info_annotation(ht.info, ht.a_index)
    return ht.annotate(
        info=ht.info.annotate(**split_fields),
        AS_lowqual=split_lowqual_annotation(ht.AS_lowqual, ht.a_index),
    )
import hail as hl ht_samples = hl.read_table( 'gs://hail-datasets-hail-data/1000_Genomes_phase3_samples.ht') ht_relationships = hl.read_table( 'gs://hail-datasets-hail-data/1000_Genomes_phase3_sample_relationships.ht') mt = hl.import_vcf( 'gs://hail-datasets-raw-data/1000_Genomes/1000_Genomes_phase3_chrMT_GRCh37.vcf.bgz', reference_genome='GRCh37') mt = mt.annotate_cols(**ht_samples[mt.s]) mt = mt.annotate_cols(**ht_relationships[mt.s]) mt_split = hl.split_multi(mt) mt_split = mt_split.select_entries( GT=hl.downcode(mt_split.GT, mt_split.a_index)) mt_split = mt_split.annotate_rows(info=hl.struct( AC=mt_split.info.AC[mt_split.a_index - 1], VT=(hl.case().when((mt_split.alleles[0].length() == 1) & ( mt_split.alleles[1].length() == 1), 'SNP').when( mt_split.alleles[0].matches('<CN*>') | mt_split.alleles[1].matches('<CN*>'), 'SV').default('INDEL')))) n_rows, n_cols = mt_split.count() n_partitions = mt_split.n_partitions() mt_split = hl.sample_qc(mt_split) mt_split = hl.variant_qc(mt_split) mt_split = mt_split.annotate_globals(
def split_multi():
    """Benchmark: split multiallelics in the profile MatrixTable and force a row count."""
    hl.split_multi(hl.read_matrix_table(resource('profile.mt')))._force_count_rows()
def main():
    """Convert the ExAC sites VCF into a Hail table for the browser.

    Imports the VCF, splits multiallelics, re-indexes per-allele info fields,
    normalizes "NA"/empty-string values to nulls, reshapes the CSQ field into a
    hail.vep-like struct, adds derived fields, and writes the result to
    --output-url.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-url",
        help="URL of ExAC sites VCF",
        default="gs://gnomad-public/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz")
    parser.add_argument("--output-url", help="URL to write Hail table to", required=True)
    parser.add_argument("--subset", help="Filter variants to this chrom:start-end range")
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    print("\n=== Importing VCF ===")
    # Only row (site-level) data is needed; entries are dropped via .rows().
    ds = hl.import_vcf(
        args.input_url, force_bgz=True, min_partitions=2000, skip_invalid_loci=True).rows()

    if args.subset:
        print(f"\n=== Filtering to interval {args.subset} ===")
        subset_interval = hl.parse_locus_interval(args.subset)
        ds = ds.filter(subset_interval.contains(ds.locus))

    print("\n=== Splitting multiallelic variants ===")
    ds = hl.split_multi(ds)
    ds = ds.repartition(2000, shuffle=True)

    # Get value corresponding to the split variant
    # a_index is 1-based, so per-allele arrays are indexed with a_index - 1.
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.or_missing(
                hl.is_defined(ds.info[field]),
                ds.info[field][ds.a_index - 1])
            for field in PER_ALLELE_FIELDS
        }))

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(info=ds.info.annotate(
        DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
        GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
    ))

    ds = ds.cache()

    print("\n=== Munging data ===")

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(info=ds.info.annotate(
            **{
                field: hl.or_missing(
                    hl.is_defined(ds.info[field]),
                    # Compare the stringified value; return the original
                    # (typed) value when it is not ""/"NA".
                    hl.bind(
                        lambda value: hl.cond(
                            (value == "") | (value == "NA"),
                            hl.null(ds.info[field].dtype), ds.info[field]),
                        hl.str(ds.info[field]),
                    ),
                )
                for field in SELECT_INFO_FIELDS[i:i + 10]
            }))

    # Convert field types
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
            for field in CONVERT_TO_INT_FIELDS
        }))
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
            for field in CONVERT_TO_FLOAT_FIELDS
        }))

    # Format VEP annotations to mimic the output of hail.vep
    # CSQ values are percent-encoded by VEP; decode the common escapes.
    ds = ds.annotate(info=ds.info.annotate(CSQ=ds.info.CSQ.map(
        lambda s: s.replace("%3A", ":").replace("%3B", ";").replace(
            "%3D", "=").replace("%25", "%").replace("%2C", ","))))
    # Each CSQ string is pipe-delimited; split it and zip with VEP_FIELDS,
    # keeping only Ensembl transcript consequences for this alt allele.
    ds = ds.annotate(vep=hl.struct(
        transcript_consequences=ds.info.CSQ.map(lambda csq_str: hl.bind(
            lambda csq_values: hl.struct(
                **{
                    field: hl.cond(csq_values[index] == "", hl.null(hl.tstr), csq_values[index])
                    for index, field in enumerate(VEP_FIELDS)
                }),
            csq_str.split("\\|"),
        )).filter(lambda annotation: annotation.Feature.startswith("ENST"))
        .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
        .map(lambda annotation: annotation.select(
            amino_acids=annotation.Amino_acids,
            biotype=annotation.BIOTYPE,
            canonical=annotation.CANONICAL == "YES",
            # cDNA_position may contain either "start-end" or, when start == end, "start"
            cdna_start=split_position_start(annotation.cDNA_position),
            cdna_end=split_position_end(annotation.cDNA_position),
            codons=annotation.Codons,
            consequence_terms=annotation.Consequence.split("&"),
            distance=hl.int(annotation.DISTANCE),
            # DOMAINS entries look like "db:name", '&'-separated.
            domains=hl.or_missing(
                hl.is_defined(annotation.DOMAINS),
                annotation.DOMAINS.split("&").map(lambda d: hl.struct(
                    db=d.split(":")[0], name=d.split(":")[1])),
            ),
            exon=annotation.EXON,
            gene_id=annotation.Gene,
            gene_symbol=annotation.SYMBOL,
            gene_symbol_source=annotation.SYMBOL_SOURCE,
            hgnc_id=annotation.HGNC_ID,
            hgvsc=annotation.HGVSc,
            hgvsp=annotation.HGVSp,
            lof=annotation.LoF,
            lof_filter=annotation.LoF_filter,
            lof_flags=annotation.LoF_flags,
            lof_info=annotation.LoF_info,
            # PolyPhen field contains "polyphen_prediction(polyphen_score)"
            polyphen_prediction=hl.or_missing(
                hl.is_defined(annotation.PolyPhen),
                annotation.PolyPhen.split("\\(")[0]),
            protein_id=annotation.ENSP,
            # Protein_position may contain either "start-end" or, when start == end, "start"
            protein_start=split_position_start(annotation.Protein_position),
            protein_end=split_position_end(annotation.Protein_position),
            # SIFT field contains "sift_prediction(sift_score)"
            sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT),
                                          annotation.SIFT.split("\\(")[0]),
            transcript_id=annotation.Feature,
        ))))

    # Rank all consequence terms across transcripts and keep the most severe.
    ds = ds.annotate(vep=ds.vep.annotate(most_severe_consequence=hl.bind(
        lambda all_consequence_terms: hl.or_missing(
            all_consequence_terms.size() != 0,
            hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]),
        ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
    )))

    ds = ds.cache()

    print("\n=== Adding derived fields ===")

    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))

    # Final browser schema: keep selected row fields, flatten info into
    # top-level fields, and group per-population counts into a struct.
    ds = ds.select(
        "filters",
        "qual",
        "rsid",
        "sorted_transcript_consequences",
        AC=ds.info.AC,
        AC_Adj=ds.info.AC_Adj,
        AC_Hemi=ds.info.AC_Hemi,
        AC_Hom=ds.info.AC_Hom,
        AF=ds.info.AF,
        AN=ds.info.AN,
        AN_Adj=ds.info.AN_Adj,
        BaseQRankSum=ds.info.BaseQRankSum,
        CCC=ds.info.CCC,
        ClippingRankSum=ds.info.ClippingRankSum,
        DB=ds.info.DB,
        DP=ds.info.DP,
        DS=ds.info.DS,
        END=ds.info.END,
        FS=ds.info.FS,
        GQ_MEAN=ds.info.GQ_MEAN,
        GQ_STDDEV=ds.info.GQ_STDDEV,
        HWP=ds.info.HWP,
        HaplotypeScore=ds.info.HaplotypeScore,
        InbreedingCoeff=ds.info.InbreedingCoeff,
        MLEAC=ds.info.MLEAC,
        MLEAF=ds.info.MLEAF,
        MQ=ds.info.MQ,
        MQ0=ds.info.MQ0,
        MQRankSum=ds.info.MQRankSum,
        NCC=ds.info.NCC,
        NEGATIVE_TRAIN_SITE=ds.info.NEGATIVE_TRAIN_SITE,
        POSITIVE_TRAIN_SITE=ds.info.POSITIVE_TRAIN_SITE,
        QD=ds.info.QD,
        ReadPosRankSum=ds.info.ReadPosRankSum,
        VQSLOD=ds.info.VQSLOD,
        culprit=ds.info.culprit,
        DP_HIST=ds.info.DP_HIST,
        GQ_HIST=ds.info.GQ_HIST,
        DOUBLETON_DIST=ds.info.DOUBLETON_DIST,
        AC_CONSANGUINEOUS=ds.info.AC_CONSANGUINEOUS,
        AN_CONSANGUINEOUS=ds.info.AN_CONSANGUINEOUS,
        Hom_CONSANGUINEOUS=ds.info.Hom_CONSANGUINEOUS,
        AGE_HISTOGRAM_HET=ds.info.AGE_HISTOGRAM_HET,
        AGE_HISTOGRAM_HOM=ds.info.AGE_HISTOGRAM_HOM,
        AC_POPMAX=ds.info.AC_POPMAX,
        AN_POPMAX=ds.info.AN_POPMAX,
        POPMAX=ds.info.POPMAX,
        K1_RUN=ds.info.K1_RUN,
        K2_RUN=ds.info.K2_RUN,
        K3_RUN=ds.info.K3_RUN,
        ESP_AF_POPMAX=ds.info.ESP_AF_POPMAX,
        ESP_AF_GLOBAL=ds.info.ESP_AF_GLOBAL,
        ESP_AC=ds.info.ESP_AC,
        KG_AF_POPMAX=ds.info.KG_AF_POPMAX,
        KG_AF_GLOBAL=ds.info.KG_AF_GLOBAL,
        KG_AC=ds.info.KG_AC,
        AC_FEMALE=ds.info.AC_FEMALE,
        AN_FEMALE=ds.info.AN_FEMALE,
        AC_MALE=ds.info.AC_MALE,
        AN_MALE=ds.info.AN_MALE,
        populations=hl.struct(
            **{
                pop_id: hl.struct(
                    AC=ds.info[f"AC_{pop_id}"],
                    AN=ds.info[f"AN_{pop_id}"],
                    hemi=ds.info[f"Hemi_{pop_id}"],
                    hom=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            }),
        # Other variants at the same pre-split locus, excluding this one.
        colocated_variants=hl.bind(
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id),
            variant_id(ds.locus, ds.alleles),
        ),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    print("\n=== Writing table ===")

    ds.write(args.output_url)
def split_multi(mt_path):
    """Benchmark: split multiallelics in the MatrixTable at *mt_path* and force a row count."""
    hl.split_multi(hl.read_matrix_table(mt_path))._force_count_rows()
def _get_filtered_mt(self, rsid='rs35471880'):
    """Load the 30-variant 1kg test VCF, keep rows matching *rsid*, and split multiallelics."""
    raw = hl.import_vcf('tests/data/1kg_30variants.vcf.bgz')
    matching = raw.filter_rows(raw.rsid == rsid)
    return hl.split_multi(matching)
def import_exac_vcf(path):
    """Import the ExAC sites VCF at *path* and return a browser-schema Hail Table.

    Splits multiallelics, re-indexes per-allele info fields, normalizes
    "NA"/empty strings to nulls, reshapes CSQ into a hail.vep-like struct, and
    selects the final exome/frequency/quality-metric schema. The `genome`
    field is a null placeholder with the same dtype as `exome`.
    """
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()
    ds = hl.split_multi(ds)

    ds = ds.repartition(5000, shuffle=True)

    # Get value corresponding to the split variant
    # a_index is 1-based, so per-allele arrays are indexed with a_index - 1.
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        # Compare the stringified value; keep the original
                        # (typed) value when it is not ""/"NA".
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[i : i + 10]
                }
            )
        )

    # Convert field types
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep
    # CSQ values are percent-encoded by VEP; decode the common escapes.
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )
    # Each CSQ string is pipe-delimited; split it and zip with VEP_FIELDS,
    # keeping only Ensembl transcript consequences for this alt allele.
    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(csq_values[index] == "", hl.null(hl.tstr),
                                              csq_values[index])
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    # DOMAINS entries look like "db:name", '&'-separated.
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT),
                                                  annotation.SIFT.split(r"\(")[0]),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    # Rank all consequence terms across transcripts and keep the most severe.
    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0,
                    hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    # 21 edges -> 20 bins of width 5 covering [0, 100] for DP/GQ histograms.
    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            # Flag allele-count-zero variants with an extra "AC0" filter.
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            # Age histograms arrive as 12 pipe-separated counts:
            # [n_smaller, 10 bins, n_larger]; missing values default to all zeros.
            age_distribution=hl.struct(
                het=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HET, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
                hom=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HOM, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                genotype_quality=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        # Other variants at the same pre-split locus, excluding this one.
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    # ExAC has no genome data; keep a null placeholder matching the exome schema.
    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds
print("\n=== Importing VCF ===") mt = hl.import_vcf(args.input_url, force_bgz=True, min_partitions=2000, skip_invalid_loci=True) # Drop entry values mt = mt.drop("AD", "DP", "GQ", "GT", "MIN_DP", "PL", "SB") if args.subset: print(f"\n=== Filtering to interval {args.subset} ===") subset_interval = hl.parse_locus_interval(args.subset) mt = mt.filter_rows(subset_interval.contains(mt.locus)) print("\n=== Splitting multiallelic variants ===") mt = hl.split_multi(mt) # For multiallelic variants, these fields contain a value for each alt allele PER_ALLELE_FIELDS = [ "AC", "AC_Adj", "AC_Hemi", "AC_Hom", "AC_MALE", "AC_FEMALE", "AF", "AC_AFR", "AC_AMR", "AC_EAS", "AC_FIN", "AC_NFE",