def test_import_fam(self):
    fam_file = resource('sample.fam')
    nfam = hl.import_fam(fam_file).count()
    i = 0
    with open(fam_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1
    self.assertEqual(nfam, i)
def test_trio_matrix_null_keys(self):
    ped = hl.Pedigree.read(resource('triomatrix.fam'))
    ht = hl.import_fam(resource('triomatrix.fam'))
    mt = hl.import_vcf(resource('triomatrix.vcf'))
    mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

    # Make all column keys null: no sample can then match the pedigree,
    # so no complete trios remain.
    mt = mt.key_cols_by(s=hl.null(hl.tstr))
    tt = hl.trio_matrix(mt, ped, complete_trios=True)
    self.assertEqual(tt.count_cols(), 0)
def test_trio_matrix(self):
    """
    This test depends on certain properties of the trio matrix VCF and
    pedigree structure. This test is NOT a valid test if the pedigree
    includes quads: the trio_matrix method will duplicate the parents
    appropriately, but the genotypes_table and samples_table orthogonal
    paths would require another duplication/explode that we haven't
    written.
    """
    ped = hl.Pedigree.read(resource('triomatrix.fam'))
    ht = hl.import_fam(resource('triomatrix.fam'))
    mt = hl.import_vcf(resource('triomatrix.vcf'))
    mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

    dads = ht.filter(hl.is_defined(ht.pat_id))
    dads = dads.select(dads.pat_id, is_dad=True).key_by('pat_id')

    moms = ht.filter(hl.is_defined(ht.mat_id))
    moms = moms.select(moms.mat_id, is_mom=True).key_by('mat_id')

    et = (mt.entries()
          .key_by('s')
          .join(dads, how='left')
          .join(moms, how='left'))
    et = et.annotate(is_dad=hl.is_defined(et.is_dad),
                     is_mom=hl.is_defined(et.is_mom))

    et = (et
          .group_by(et.locus, et.alleles, fam=et.fam)
          .aggregate(data=hl.agg.collect(hl.struct(
              role=hl.case().when(et.is_dad, 1).when(et.is_mom, 2).default(0),
              g=hl.struct(GT=et.GT, AD=et.AD, DP=et.DP, GQ=et.GQ, PL=et.PL)))))

    et = et.filter(hl.len(et.data) == 3)
    et = et.select('data').explode('data')

    tt = hl.trio_matrix(mt, ped, complete_trios=True).entries().key_by('locus', 'alleles')
    tt = tt.annotate(fam=tt.proband.fam,
                     data=[hl.struct(role=0, g=tt.proband_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                           hl.struct(role=1, g=tt.father_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                           hl.struct(role=2, g=tt.mother_entry.select('GT', 'AD', 'DP', 'GQ', 'PL'))])
    tt = tt.select('fam', 'data').explode('data')
    tt = tt.filter(hl.is_defined(tt.data.g)).key_by('locus', 'alleles', 'fam')

    self.assertEqual(et.key.dtype, tt.key.dtype)
    self.assertEqual(et.row.dtype, tt.row.dtype)
    self.assertTrue(et._same(tt))

    # test annotations
    e_cols = (mt.cols()
              .join(dads, how='left')
              .join(moms, how='left'))
    e_cols = e_cols.annotate(is_dad=hl.is_defined(e_cols.is_dad),
                             is_mom=hl.is_defined(e_cols.is_mom))
    e_cols = (e_cols.group_by(fam=e_cols.fam)
              .aggregate(data=hl.agg.collect(hl.struct(
                  role=hl.case().when(e_cols.is_dad, 1).when(e_cols.is_mom, 2).default(0),
                  sa=hl.struct(**e_cols.row.select(*mt.col))))))
    e_cols = e_cols.filter(hl.len(e_cols.data) == 3).select('data').explode('data')

    t_cols = hl.trio_matrix(mt, ped, complete_trios=True).cols()
    t_cols = t_cols.annotate(fam=t_cols.proband.fam,
                             data=[hl.struct(role=0, sa=t_cols.proband),
                                   hl.struct(role=1, sa=t_cols.father),
                                   hl.struct(role=2, sa=t_cols.mother)]
                             ).key_by('fam').select('data').explode('data')
    t_cols = t_cols.filter(hl.is_defined(t_cols.data.sa))

    self.assertEqual(e_cols.key.dtype, t_cols.key.dtype)
    self.assertEqual(e_cols.row.dtype, t_cols.row.dtype)
    self.assertTrue(e_cols._same(t_cols))
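# A sketch of the quad caveat from the docstring above, using invented sample
# IDs rather than the test resources: two children who share the same parents
# form two complete trios, so trio_matrix duplicates the parent columns once
# per child (which is exactly what the manual join paths above do not handle).
def test_quad_pedigree_duplicates_parents(self):
    quad_ped = hl.Pedigree([
        hl.Trio('kid1', fam_id='fam1', pat_id='dad1', mat_id='mom1', is_female=True),
        hl.Trio('kid2', fam_id='fam1', pat_id='dad1', mat_id='mom1', is_female=False)])
    self.assertEqual(len(quad_ped.complete_trios()), 2)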
def get_gnomad_data(data_type: str, adj: bool = False, split: bool = True, raw: bool = False,
                    non_refs_only: bool = False, hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: Optional[str] = None, meta_root: Optional[str] = 'meta', full_meta: bool = False,
                    fam_version: str = CURRENT_FAM, fam_root: Optional[str] = None,
                    duplicate_mapping_root: Optional[str] = None, release_samples: bool = False,
                    release_annotations: Optional[str] = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as a MatrixTable.
    By default, returns split hardcalls (with adj annotated but not filtered).

    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies when raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotypes-only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param bool full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of the pedigree (defaults to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome sample ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException('No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type, split=split, non_refs_only=non_refs_only, hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type, hardcalls=not raw, split=split, hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                                 key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_ht = get_gnomad_public_data(data_type, split)
        mt = mt.select_rows(**sites_ht[mt.row_key])
        mt = mt.select_globals()  # Required since a backward-incompatible change in Hail

    return mt
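# Hedged usage sketch for the wrapper above (the gnomAD buckets and helper
# resources must exist in your environment; shown as comments so nothing
# runs on import):
#
#     mt = get_gnomad_data('exomes', adj=True, release_samples=True)  # split adj hardcalls, release samples only
#     raw_mt = get_gnomad_data('genomes', raw=True, split=False)      # raw, unsplit data (very large)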
def main(args):
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(basename(args.input_mt))[0]
    # Outputs are addressed through a path() helper (see the sketch after main()).

    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'), overwrite=args.overwrite)
        strat_hts = [hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr(
                [sample_qc_ht.sample_qc] +
                [strat_hts[i][sample_qc_ht.key].sample_qc for i in range(len(strat_hts))]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), args.overwrite)

    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'), overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'), overwrite=args.overwrite)

    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        # Written via path() so the impute_sex and write_full_meta steps below
        # read it back from the same location.
        platform_ht.write(path('platform_pca_results.ht'), overwrite=args.overwrite)

    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold,
                           args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT, k=10, compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), args.overwrite)

        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), args.overwrite)

    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        samples_rankings_ht = sample_qc_ht.select(rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')),
            samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s), duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')),
                             dups_to_remove)
        ped.write(path('pedigree.ped'))

    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')),
            hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))

        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i, related_pairs_ht.j,
            keep=False, tie_breaker=related_pairs_tie_breaker)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(**related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'), overwrite=args.overwrite)

    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')),
            args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'), args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'), args.overwrite)

    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[pop_pca_scores_ht.key]
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(
            known_pop=hl.or_missing(gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)
        pop_ht.write(path('pop.ht'), args.overwrite)
        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)
        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.')
        qc_mt = hl.filter_intervals(qc_mt, platform_specific_intervals, keep=False)
        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr), known_subpop=hl.null(hl.tstr))
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(
            known_pop=kgp_mt.super_pops.get(kgp_mt.population, "oth").lower(),
            known_subpop=kgp_mt.population.lower())
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'), overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'), args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'), args.overwrite)

    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt.cols()[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)
        union_kgp_pop_ht.write(path('union_kgp_pop.ht'), args.overwrite)
        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)
        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.')
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt, platform_specific_intervals, keep=False)
        related_samples_to_drop_ht = hl.read_table(path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(path(f'{variant_class_prefix}sample_qc.ht'))
            pop_ht = hl.read_table(path('pop.ht'))  # written by the assign_pops step
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht,
                args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)

    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename({'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(related_filtered=True),
            hl.read_table(path('pop_pca_scores.ht')).rename({'scores': 'pop_pc_scores'}),
            hl.read_table(path('pop.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename({'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename({
                    f: f'{variant_class_prefix}{f}'
                    for f in list(stratified_metrics_filters_ht.globals) +
                             list(stratified_metrics_filters_ht.row_value)
                })
            meta_annotation_hts.extend([sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        meta_ht = meta_ht.annotate_globals(**{
            name: expr
            for ann_ht in meta_annotation_hts
            for name, expr in ann_ht.index_globals().items()
        })
        meta_ht = meta_ht.annotate(**{
            name: expr
            for ann_ht in meta_annotation_hts
            for name, expr in ann_ht[meta_ht.key].items()
        })

        filtering_col_prefix = '' if args.filtering_variant_class == 'all' else args.filtering_variant_class + "_"
        meta_ht = meta_ht.annotate_globals(filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate': meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate': hl.is_defined(meta_ht.dup_filtered) & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
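# main() above relies on a path() helper and a CLI that are not shown in this
# excerpt. Below is a minimal sketch of both: the path() naming scheme and the
# flag names are inferred from the attributes main() reads, and are
# assumptions rather than the original definitions.

def path(name: str) -> str:
    # Assumed convention: outputs live next to the input MT basename,
    # e.g. <output_dir>/<input_basename>.qc.mt
    return f'{output_prefix}.{name}'


if __name__ == '__main__':
    import argparse  # assumed; could equally live at module top

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_mt', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--meta')
    parser.add_argument('--overwrite', action='store_true')
    # Every boolean step flag main() reads must be declared; a few shown here:
    parser.add_argument('--compute_qc_mt', action='store_true')
    parser.add_argument('--compute_qc_metrics', action='store_true')
    parser.add_argument('--run_pc_relate', action='store_true')
    parser.add_argument('--write_full_meta', action='store_true')
    # ...plus the numeric thresholds such as --n_pcs, --min_pop_prob,
    # --min_emission_kinship, --min_call_rate, etc.
    main(parser.parse_args())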
def pbt_phased_trios_mt_path(data_type: str, split: bool = True, hail_version: str = CURRENT_HAIL_VERSION):
    return "gs://gnomad/hardcalls/hail-{0}/mt/{1}/gnomad.{1}.trios.pbt_phased{2}.mt".format(
        hail_version, data_type, "" if split else ".unsplit")


exomes = hl.read_matrix_table(pbt_phased_trios_mt_path("exomes"))
exomes = exomes.filter_cols(exomes.s == exomes.source_trio.proband.s)  # keep probands only
df = phase_sensitivity_fast(exomes, windowsize=100)  # should be tractable for a single individual
print("per indv exome done " + tm.ctime())
df["categ"] = df.index
hl.Table.from_pandas(df).export(
    "gs://gnomad-qingbowang/MNV/phase_sensitivity_exome_proband_w100.tsv")

genomes = hl.read_matrix_table(pbt_phased_trios_mt_path("genomes"))
fam_ht = hl.import_fam(fam_path("genomes"), delimiter="\t")  # for genomes, the trio annotation must be added here
genomes = genomes.annotate_cols(source_trio=fam_ht[genomes.s])
genomes = genomes.filter_cols(hl.len(genomes.source_trio.fam_id) > 0)  # keep children only
df = phase_sensitivity_fast(genomes, windowsize=100)  # should be tractable for a single individual
print("per indv genome done " + tm.ctime())
df["categ"] = df.index
hl.Table.from_pandas(df).export(
    "gs://gnomad-qingbowang/MNV/phase_sensitivity_genome_proband_w100.tsv")