def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1], filter='\[Controls\]', skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(chip_pos.chr))
        chip_pos = chip_pos.key_by(locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        # liftover chip position info
        chip_pos = chip_pos.annotate(new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(chip_pos[full_vcf.locus]))
        hl.export_vcf(geno_vcf,
                      'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
def plinkify(ds, min=None, max=None):
    vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
    plinkpath = utils.new_temp_file(prefix="plink")
    hl.export_vcf(ds, vcf)
    threshold_string = "{} {}".format("--min {}".format(min) if min else "",
                                      "--max {}".format(max) if max else "")

    plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
        .format(utils.uri_path(vcf),
                utils.uri_path(plinkpath),
                threshold_string)
    result_file = utils.uri_path(plinkpath + ".genome")

    syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

    ### format of .genome file is:
    # _, fid1, iid1, fid2, iid2, rt, ez, z0, z1, z2, pihat, phe,
    # dst, ppc, ratio, ibs0, ibs1, ibs2, homhom, hethet (+ separated)

    ### format of ibd is:
    # i (iid1), j (iid2), ibd: {Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2

    results = {}
    with open(result_file) as f:
        f.readline()
        for line in f:
            row = line.strip().split()
            results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                         list(map(int, row[14:17])))
    return results
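# --------------------------------------------------------------------------
# Hypothetical usage sketch for plinkify() above (not from the original
# source): it assumes PLINK is on PATH and that 'sample.vcf' is any small
# biallelic dataset. It compares the parsed .genome results against Hail's
# built-in hl.identity_by_descent().
ds = hl.split_multi_hts(hl.import_vcf('sample.vcf'))
plink_results = plinkify(ds)  # {(iid1, iid2): ([Z0, Z1, Z2, PI_HAT], [IBS0, IBS1, IBS2])}
hail_ibd = hl.identity_by_descent(ds)

for row in hail_ibd.collect():
    # PLINK may order the pair differently, so try both key orders
    key = (row.i, row.j) if (row.i, row.j) in plink_results else (row.j, row.i)
    (z0, z1, z2, pi_hat), _ = plink_results[key]
    assert abs(pi_hat - row.ibd.PI_HAT) < 0.05  # estimates should roughly agree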
def main(args):
    hl.init(
        default_reference=args.import_build,
        log="/import_vcf.log",
        tmp_dir="hdfs:///import_vcf.tmp/",
    )

    logger.info("Importing filters info...")
    filters_ht = hl.import_vcf(
        args.part_two_path,
        reference_genome=args.import_build,
        force_bgz=True,
        find_replace=("nul", "."),
    ).rows()

    logger.info("Importing VCF...")
    # NOTE: always assumes file is bgzipped
    mt = hl.import_vcf(
        args.part_one_path,
        force_bgz=True,
        reference_genome=args.import_build,
        find_replace=("nul", "."),
    )

    mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
    logger.info(f"MT count: {mt.count()}")
    hl.export_vcf(mt, args.vcf_out, parallel="header_per_shard")
def export_qced_file(mt: hl.MatrixTable, out_dir: str, basename: str, export_type='hail'):
    outname = basename + '_qced'

    if export_type == 'hail':
        mt.write('{}GWASpy/Preimp_QC/{}.mt'.format(out_dir, outname), overwrite=True)

    elif export_type == 'plink':
        hl.export_plink(dataset=mt,
                        output='{}GWASpy/Preimp_QC/{}'.format(out_dir, outname),
                        fam_id=mt.fam_id,
                        ind_id=mt.s,
                        pat_id=mt.pat_id,
                        mat_id=mt.mat_id,
                        is_female=mt.is_female,
                        pheno=mt.is_case,
                        varid=mt.rsid)

    else:
        hl.export_vcf(mt, '{}GWASpy/Preimp_QC/{}.vcf.bgz'.format(out_dir, outname))
def test_not_identical_headers(self):
    t = new_temp_file('vcf')
    mt = hl.import_vcf(resource('sample.vcf'))
    hl.export_vcf(mt.filter_cols((mt.s != "C1048::HG02024") & (mt.s != "HG00255")), t)

    with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
        (hl.import_vcf([resource('sample.vcf'), t])
         ._force_count_rows())
def test_export_vcf(self):
    dataset = hl.import_vcf(resource('sample.vcf.bgz'))
    vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
    hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
    dataset_imported = hl.import_vcf('/tmp/sample.vcf')
    self.assertTrue(dataset._same(dataset_imported))

    metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
    self.assertDictEqual(vcf_metadata, metadata_imported)
def main(args):
    '''
    participant_data = args.participant_data
    sample_map = args.sample_map
    coverage_mt_path = args.coverage_mt_path
    vcf_col = args.vcf_col
    artifact_prone_sites_path = args.artifact_prone_sites_path
    output_bucket = args.output_bucket
    file_suffix = args.file_suffix
    minimum_homref_coverage = args.minimum_homref_coverage
    chunk_size = args.chunk_size
    overwrite = args.overwrite
    '''
    input_tsv = args.input_tsv
    artifact_prone_sites_path = args.artifact_prone_sites_path
    chunk_size = args.chunk_size
    overwrite = args.overwrite
    output_bucket = args.output_bucket
    file_suffix = args.file_suffix
    minimum_homref_coverage = args.minimum_homref_coverage
    coverage_mt_path = args.coverage_mt_path

    '''
    logger.info("Confirming existence of individual sample vcfs...")
    confirmed_vcfs = check_vcf_existence(
        participant_data, vcf_col, sample_map, output_bucket
    )
    '''

    logger.info("Combining VCFs...")
    combined_mt = join_mitochondria_vcfs_into_mt(input_tsv, output_bucket, chunk_size)
    output_path_mt = f"{output_bucket}/raw_combined_mt.mt"
    combined_mt = combined_mt.checkpoint(output_path_mt, overwrite=overwrite)

    logger.info("Removing certain FT filters...")
    combined_mt = remove_FT_values(combined_mt)

    logger.info("Determining homoplasmic reference sites...")
    combined_mt = determine_hom_refs(combined_mt, coverage_mt_path, minimum_homref_coverage)

    logger.info("Applying artifact_prone_site filter...")
    combined_mt = apply_mito_artifact_filter(combined_mt, artifact_prone_sites_path)

    logger.info("Writing combined MT and VCF...")
    # set the file names for output files
    out_vcf = f"{output_bucket}/combined_{file_suffix}.vcf.bgz"
    out_mt = f"{output_bucket}/combined_{file_suffix}.mt"

    combined_mt.write(out_mt, overwrite=True)
    hl.export_vcf(combined_mt, out_vcf, metadata=META_DICT)

    logger.info("DONE")
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' +
                          mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))  # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
def test_export_vcf(self):
    dataset = hl.import_vcf(resource('sample.vcf.bgz'))
    vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
    hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
    dataset_imported = hl.import_vcf('/tmp/sample.vcf')
    self.assertTrue(dataset._same(dataset_imported))

    no_sample_dataset = dataset.filter_cols(False).select_entries()
    hl.export_vcf(no_sample_dataset, '/tmp/no_sample.vcf', metadata=vcf_metadata)
    no_sample_dataset_imported = hl.import_vcf('/tmp/no_sample.vcf')
    self.assertTrue(no_sample_dataset._same(no_sample_dataset_imported))

    metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
    self.assertDictEqual(vcf_metadata, metadata_imported)
def main(args): hl.init(default_reference="GRCh38", log="/qc_annotations.log") if args.compute_info: compute_info().write(get_info(split=False).path, overwrite=args.overwrite) if args.split_info: split_info().write(get_info(split=True).path, overwrite=args.overwrite) if args.export_info_vcf: info_ht = get_info(split=False).ht() hl.export_vcf(ht_to_vcf_mt(info_ht), info_vcf_path()) if args.generate_allele_data: mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True) generate_allele_data(mt.rows()).write(allele_data.path, overwrite=args.overwrite) if args.generate_ac: # TODO: compute AC and qc_AC as part of compute_info mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True) mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True) ht = generate_ac(mt).checkpoint( "gs://gnomad-tmp/ac_tmp.ht", overwrite=args.overwrite, _read_if_exists=not args.overwrite, ) ht.repartition(10000, shuffle=False).write(qc_ac.path, overwrite=args.overwrite) if args.generate_fam_stats: mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True) mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True) fam_stats_ht = generate_fam_stats(mt, trios.path) fam_stats_ht = fam_stats_ht.checkpoint( "gs://gnomad-tmp/fam_stats_tmp.ht", overwrite=args.overwrite, _read_if_exists=not args.overwrite, ) fam_stats_ht = fam_stats_ht.repartition(10000, shuffle=False) fam_stats_ht.write(fam_stats.path, overwrite=args.overwrite) if args.export_transmitted_singletons_vcf: export_transmitted_singletons_vcf() if args.vep: run_vep(vep_version=args.vep_version).write(vep.path, overwrite=args.overwrite)
def main(args): print("main") run_hash = "91b132aa" ht = hl.read_table( f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_ranked_denovo_ddd_comp.ht' ) mt = hl.read_matrix_table( f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-7and20_split_sampleqc_filtered.mt' ) mt = mt.annotate_rows(Variant_Type=hl.cond( (hl.is_snp(mt.alleles[0], mt.alleles[1])), "SNP", hl.cond( hl.is_insertion(mt.alleles[0], mt.alleles[1]), "INDEL", hl.cond(hl.is_deletion(mt.alleles[0], mt.alleles[1]), "INDEL", "Other")))) mt = mt.annotate_rows(info=mt.info.annotate( rf_probability=ht[mt.row_key].rf_probability['TP'])) mt = mt.annotate_rows(info=mt.info.annotate(score=ht[mt.row_key].score)) filter_column_annotation = ( hl.case().when( ((mt.Variant_Type == "SNP") & (mt.info.rf_probability <= 0.90)), "PASS").when(((mt.Variant_Type == "INDEL") & (mt.info.rf_probability <= 0.80)), "PASS").default(".") # remove everything else ) # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation) mt1 = mt.annotate_rows(filtercol=((filter_column_annotation))) mt_fail = mt1.filter_rows(mt1.filtercol == ".") print(mt_fail.count()) mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol)) mt_fail2 = mt2.filter_rows(mt2.filters.contains(".")) mt_pass = mt2.filter_rows(mt2.filters.contains("PASS")) print(mt_fail2.count()) print(mt_pass.count()) mt2 = mt2.checkpoint( f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.mt', overwrite=True) hl.export_vcf( mt2, f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.vcf.bgz', parallel='separate_header')
def main(args):
    tgp_vcf = hl.import_vcf(args.input_vcf, force_bgz=True,
                            header_file='gs://neurogap/reference_data/header',
                            reference_genome='GRCh38', min_partitions=200)
    sample_info = hl.import_table(args.sample_info).key_by('sample')
    tgp_vcf = tgp_vcf.annotate_cols(**sample_info[tgp_vcf.s])
    afr_vcf = tgp_vcf.filter_cols(
        (tgp_vcf.super_pop == 'AFR') &
        ~((tgp_vcf.pop == 'ASW') | (tgp_vcf.pop == 'ACB')))

    # NA18498 is missing
    sample_info.filter(hl.is_missing(tgp_vcf.cols()[sample_info['sample']])).show()

    hl.export_vcf(afr_vcf, args.output_filename)
def main(args): print("main") run_hash = "91b132aa" ht = hl.read_table( f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_ranked_denovo_ddd_comp.ht' ) mt = hl.read_matrix_table( f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_after_RF_final.mt' ) hl.export_vcf( mt, f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.vcf.bgz', parallel='separate_header')
def test_export_plink(self):
    vcf_file = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

    # permute columns so not in alphabetical order!
    import random
    indices = list(range(mt.count_cols()))
    random.shuffle(indices)
    mt = mt.choose_cols(indices)

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command(["plink", "--vcf", split_vcf_file,
                 "--make-bed", "--out", plink_output,
                 "--const-fid", "--keep-allele-order"])

    data = []
    with open(uri_path(plink_output + ".bim")) as file:
        for line in file:
            row = line.strip().split()
            row[1] = ":".join([row[0], row[3], row[5], row[4]])
            data.append("\t".join(row) + "\n")

    with open(plink_output + ".bim", 'w') as f:
        f.writelines(data)

    run_command(["plink", "--bfile", plink_output, "--bmerge", hl_output,
                 "--merge-mode", "6", "--out", merge_output])

    same = True
    with open(merge_output + ".diff") as f:
        for line in f:
            row = line.strip().split()
            if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                same = False
                break

    self.assertTrue(same)
def test_impute_sex_same_as_plink(self):
    import subprocess as sp

    ds = hl.import_vcf(resource('x-chromosome.vcf'))
    sex = hl.impute_sex(ds.GT, include_par=True)

    vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
    out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))

    hl.export_vcf(ds, vcf_file)

    try:
        out = sp.check_output(
            ["plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
             "--silent", "--out", out_file],
            stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        print(e.output)
        raise e

    plink_sex = hl.import_table(out_file + '.sexcheck',
                                delimiter=' +',
                                types={'SNPSEX': hl.tint32, 'F': hl.tfloat64})
    plink_sex = plink_sex.select('IID', 'SNPSEX', 'F')
    plink_sex = plink_sex.select(
        s=plink_sex.IID,
        is_female=hl.cond(plink_sex.SNPSEX == 2, True,
                          hl.cond(plink_sex.SNPSEX == 1, False, hl.null(hl.tbool))),
        f_stat=plink_sex.F).key_by('s')

    sex = sex.select(s=sex.s, is_female=sex.is_female, f_stat=sex.f_stat)

    self.assertTrue(plink_sex._same(sex.select_globals(), tolerance=1e-3))

    ds = ds.annotate_rows(aaf=(agg.call_stats(ds.GT, ds.alleles)).AF[1])
    self.assertTrue(hl.impute_sex(ds.GT)._same(hl.impute_sex(ds.GT, aaf='aaf')))
def test_export_plink(self):
    vcf_file = resource('sample.vcf')
    mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

    split_vcf_file = uri_path(new_temp_file())
    hl_output = uri_path(new_temp_file())
    plink_output = uri_path(new_temp_file())
    merge_output = uri_path(new_temp_file())

    hl.export_vcf(mt, split_vcf_file)
    hl.export_plink(mt, hl_output)

    run_command([
        "plink", "--vcf", split_vcf_file, "--make-bed", "--out", plink_output,
        "--const-fid", "--keep-allele-order"
    ])

    data = []
    with open(uri_path(plink_output + ".bim")) as file:
        for line in file:
            row = line.strip().split()
            row[1] = ":".join([row[0], row[3], row[5], row[4]])
            data.append("\t".join(row) + "\n")

    with open(plink_output + ".bim", 'w') as f:
        f.writelines(data)

    run_command([
        "plink", "--bfile", plink_output, "--bmerge", hl_output,
        "--merge-mode", "6", "--out", merge_output
    ])

    same = True
    with open(merge_output + ".diff") as f:
        for line in f:
            row = line.strip().split()
            if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                same = False
                break

    self.assertTrue(same)
def export_transmitted_singletons_vcf():
    """
    Exports the transmitted singleton Table to a VCF.

    :return: None
    """
    qc_ac_ht = qc_ac.ht()

    for transmission_confidence in ['raw', 'adj']:
        ts_ht = qc_ac_ht.filter(
            (fam_stats.ht()[qc_ac_ht.key][f'n_transmitted_{transmission_confidence}'] == 1)
            & (qc_ac_ht.ac_qc_samples_raw == 2)
        )

        ts_ht = ts_ht.annotate(s=hl.null(hl.tstr))

        ts_mt = ts_ht.to_matrix_table_row_major(columns=['s'], entry_field_name='s')
        ts_mt = ts_mt.filter_cols(False)
        hl.export_vcf(ts_mt, get_transmitted_singleton_vcf_path(transmission_confidence), tabix=True)
def main(args): """ Subset joint-called VCF or MT to desired samples. Used when CMG/MGRC collaborators request VCFs of their data. """ hl.init(log="/subset.log", default_reference="GRCh38") if args.vcf_path: logger.info("Importing VCF...") logger.warning("Assuming VCF is bgzipped!") mt = hl.import_vcf(args.vcf_path, force_bgz=True, reference_genome=args.import_build) else: logger.info("Reading in MT...") mt = hl.read_matrix_table(args.mt_path) logger.info(f"Input MT counts: {mt.count()}") mt.describe() if args.mapping: logger.info("Mapping VCF IDs to seqr IDs...") logger.warning("Assuming mapping file the field names s and seqr_id!") mt = remap_sample_ids(mt, args.mapping) logger.info("Subsetting to specified samples and their variants...") mt = subset_samples_and_variants(mt, args.sample_list, header=args.no_header, table_key=args.table_key) logger.info(f"MT counts after subsetting: {mt.count()}") logger.info("Exporting VCF...") if "bgz" not in args.vcf_out: logger.warning( "Path to output VCF does not contain '.bgz'; export might be really slow!" ) hl.export_vcf(mt, args.vcf_out, parallel=args.parallel)
print("importing vds files") vds = hl.read_matrix_table(vds_splitmulti_file) print("removing lcr") lcr = hl.import_bed(lcr_file, reference_genome='GRCh38') vds = vds.filter_rows(hl.is_defined(lcr[vds.locus]), keep=False) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # II. Downcoding # Only keep GT for individual column #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("downcoding...") vds = vds.select_entries(vds.GT) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # III. Write output VCF #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("writing out vcf...") hl.export_vcf(vds, qced_vcf_file) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # print Runtime #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ stop = timeit.default_timer() print("runtime: " + str(stop - start) + " seconds")
def export_vcf():
    mt = hl.read_matrix_table(resource('profile.mt'))
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
def prepare_downloads_for_dataset(dataset_id):
    output_path = pipeline_config.get("output", "staging_path")
    dataset_prefix = os.path.join(output_path, dataset_id.lower())
    output_prefix = os.path.join(output_path, "downloads", dataset_id)

    gene_results_path = os.path.join(dataset_prefix, "gene_results.ht")
    gene_results = hl.read_table(gene_results_path)
    validate_gene_results_table(gene_results)

    gene_group_result_fields = gene_results.group_results.dtype.value_type.fields

    gene_results_dsv = gene_results
    gene_results_dsv = gene_results_dsv.transmute(
        group_results=hl.array(gene_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[1]
            .annotate(group=group_and_result[0])
            .select("group", *gene_group_result_fields)))
    gene_results_dsv = gene_results_dsv.explode(gene_results_dsv.group_results, name="group_result")
    gene_results_dsv = gene_results_dsv.transmute(**gene_results_dsv.group_result)
    gene_results_dsv.export(os.path.join(output_prefix, f"{dataset_id}_gene_results.tsv.bgz"))

    variant_results_path = os.path.join(dataset_prefix, "variant_results.ht")
    variant_results = hl.read_table(variant_results_path)
    validate_variant_results_table(variant_results)

    variant_group_result_fields = variant_results.group_results.dtype.value_type.fields

    variant_results_dsv = variant_results
    variant_results_dsv = variant_results_dsv.transmute(**variant_results_dsv.info)
    variant_results_dsv = variant_results_dsv.transmute(
        group_results=hl.array(variant_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[1]
            .annotate(group=group_and_result[0])
            .select("group", *variant_group_result_fields)))
    variant_results_dsv = variant_results_dsv.explode(variant_results_dsv.group_results, name="group_result")
    variant_results_dsv = variant_results_dsv.transmute(**variant_results_dsv.group_result)
    variant_results_dsv.export(os.path.join(output_prefix, f"{dataset_id}_variant_results.tsv.bgz"))

    variant_results_groups = variant_results.aggregate(
        hl.agg.explode(hl.agg.collect_as_set, variant_results.group_results.keys()))

    variant_info_fields = variant_results.info.dtype.fields
    variant_base_fields = set(variant_results.row_value) - {"info", "group_results"}

    all_fields = list(variant_base_fields) + list(variant_info_fields) + list(variant_group_result_fields)
    assert len(all_fields) == len(set(all_fields)), "Conflicting field names"

    variant_results_vcf = variant_results
    variant_results_vcf = variant_results_vcf.annotate(
        groups=variant_results_vcf.group_results.keys())
    variant_results_vcf = variant_results_vcf.select(
        info=hl.struct(
            **{f: variant_results_vcf[f] for f in variant_base_fields},
            **{f: variant_results_vcf.info[f] for f in variant_info_fields},
            groups=variant_results_vcf.groups,
            **dict(
                map(
                    lambda f: (
                        f,
                        variant_results_vcf.groups.map(
                            lambda group: variant_results_vcf.group_results[group][f]),
                    ),
                    variant_group_result_fields,
                )),
        ),
    )

    def _convert_type(field):
        if isinstance(field.dtype, hl.tarray):
            if field.dtype.element_type == hl.tbool:
                return field.map(hl.int)
        return field

    variant_results_vcf = variant_results_vcf.annotate(
        info=variant_results_vcf.info.annotate(
            **{f: _convert_type(variant_results_vcf.info[f]) for f in all_fields}))

    with NamedTemporaryFile("w") as header_file:
        header_file.write(f"analysis_groups={','.join(variant_results_groups)}")
        hl.export_vcf(
            variant_results_vcf,
            os.path.join(output_prefix, f"{dataset_id}_variant_results.vcf.bgz"),
            append_to_header=f"file://{header_file.name}",
            metadata={
                "info": {
                    **{
                        f: {"Number": str(len(variant_results_groups))}
                        for f in variant_group_result_fields
                    }
                }
            },
        )
def main(args):
    hl.init(default_reference='GRCh38')

    hgdp_inputs = []
    tgp_inputs = []
    with hl.hadoop_open('gs://hgdp_tgp/misc/tgp_plus_hgdp_30x_reblocked_gvcfs.txt', 'r') as f:
        for line in f:
            line = line.strip()
            hgdp_inputs.append(line) if 'HGDP' in line else tgp_inputs.append(line)

    temp_bucket = 'gs://gnomad-tmp/tgp_hgdp'

    if args.get_sample_names:
        get_sample_names_from_list_of_files(tgp_inputs, get_samples_path('tgp'))
        get_sample_names_from_list_of_files(hgdp_inputs, get_samples_path('hgdp'))

    if args.create_sparse_mt:
        sample_names = get_sample_list_in_order(get_samples_path('tgp'), tgp_inputs)
        hl.experimental.run_combiner(tgp_inputs,
                                     out_file=get_reference_mt_path('tgp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('tgp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)
        sample_names = get_sample_list_in_order(get_samples_path('hgdp'), hgdp_inputs)
        hl.experimental.run_combiner(hgdp_inputs,
                                     out_file=get_reference_mt_path('hgdp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('hgdp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)

        tgp_mt = hl.read_matrix_table(get_reference_mt_path('tgp', sparse=True))
        tgp_mt = tgp_mt.annotate_entries(
            gvcf_info=tgp_mt.gvcf_info.drop('MQ0', 'VariantType')).drop('AB', 'MQ0')
        hgdp_mt = hl.read_matrix_table(get_reference_mt_path('hgdp', sparse=True))
        hgdp_mt = hgdp_mt.annotate_entries(
            gvcf_info=hgdp_mt.gvcf_info.select(*tgp_mt.gvcf_info))
        mt = combine_gvcfs([tgp_mt, hgdp_mt])
        mt.write(get_reference_mt_path(sparse=True), overwrite=args.overwrite)

    if args.densify_mt:
        mt = hl.read_matrix_table(get_reference_mt_path(sparse=True)).key_rows_by('locus', 'alleles')
        mt = hl.experimental.densify(hl.experimental.sparse_split_multi(mt))
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)
        mt.naive_coalesce(5000).write(get_reference_mt_path(), args.overwrite)

    mt = hl.read_matrix_table(get_reference_mt_path()).drop('gvcf_info')
    hl.export_vcf(mt, get_reference_mt_path(extension='vcf.bgz'), parallel='header_per_shard')
def export_vcf(mt_path):
    mt = hl.read_matrix_table(mt_path)
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
with herzog.Cell("python"): mt = mt.annotate_rows(info=mt.info.annotate(AC=mt.variant_qc.AC)) with herzog.Cell("markdown"): """ We export the VCF as several files (shards) to speed up the process. """ with herzog.Cell("python"): start_vcf_write_time = time.time() with herzog.Cell("python"): mt = mt.repartition(25) hl.export_vcf(mt, bucket + 'MyProject_MAFgt0.01.vcf.bgz', parallel='header_per_shard') with herzog.Cell("python"): elapsed_vcf_write_time = time.time() - start_vcf_write_time with herzog.Cell("python"): print(timedelta(seconds=elapsed_vcf_write_time)) with herzog.Cell("markdown"): """ Check that these files were successfully loaded to the bucket: """ with herzog.Cell("python"): get_ipython().system(
        mt.count_rows() * 2)))

mt = mt.annotate_rows(info=mt.info.annotate(
    cohort_names=mt.MAF_cohorts.keys()))
mt = mt.annotate_rows(info=mt.info.annotate(
    MAF_cohorts_values=mt.MAF_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    AN_cohorts_values=mt.AN_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    AC_cohorts=mt.AC_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    missingness_cohorts_values=mt.missingness_cohorts.values()))

mt = mt.checkpoint(
    f'{lustre_dir}/variant_qc/megaWES_mt_with_stats_w20_June_2021.mt',
    overwrite=True)

hl.export_vcf(
    mt,
    f'{lustre_dir}/variant_qc/megaWES_mt_after_RF_cohort_stats_w20.vcf.bgz',
    parallel='separate_header')

mt1 = mt.select_entries()
mt_fin = mt1.filter_cols(mt1['s'] == 'sample')

hl.export_vcf(
    mt_fin,
    f"{lustre_dir}/variant_qc/megaWES_mt_after_RF_stats_for_VEP_w20.vcf.bgz",
    parallel='separate_header')
def main(
    json_str: str,
    dataset: str,
    reference: Optional[str],
    multi_fam: bool,
    skip_mt: bool,
    skip_vcf: bool,
):
    """
    This takes the family structures encoded in the JSON str and creates
    a number of single-family objects in the test bucket

    The option to include a multi-family structure is useful for experimenting
    with operations mapping MOI patterns per-family within a larger dataset.
    This will be useful in analysing runtime/cost of analysing a single family
    with and without extracting from a larger dataset first

    Additional options permit the skipping of either the VCF or MT for each
    family subset
    """

    # parse the families dict from the input string, e.g. '{"fam1": ["sam1", "sam2"]}'
    families_dict = json.loads(json_str)

    gcp_test_bucket = f'gs://cpg-{dataset}-test'

    # path built in this way to pass separate components to the GCP file check
    gcp_main_bucket = f'gs://cpg-{dataset}-main'
    mt_in_bucket = os.path.join('mt', f'{dataset}.mt')
    gcp_mt_full = os.path.join(gcp_main_bucket, mt_in_bucket)

    # collect all unique sample IDs for a single filter on the MT
    all_samples = get_all_unique_members(families_dict)

    mt = read_mt(gcp_mt_full, reference=reference)

    if multi_fam:
        # pull all samples from all requested families
        multi_fam_mt = obtain_mt_subset(mt, list(all_samples))
        # force-write this family MT to a test location
        multi_fam_mt.write(
            os.path.join(gcp_test_bucket, 'multiple_families.mt'), overwrite=True
        )

    # check samples all present
    check_samples_in_mt(all_samples, families_dict, mt)

    # for each family, dump both a small MT and a VCF containing the same samples/variants
    for family, samples in families_dict.items():
        # pull out only this family's samples from the MT
        family_mt = obtain_mt_subset(mt, samples)

        if not skip_mt:
            # write this family MT to a test location
            family_mt.write(os.path.join(gcp_test_bucket, f'{family}.mt'))

        if not skip_vcf:
            # revert to a VCF file format, and write to a test location
            # hail generic method, so the MT is an argument
            hl.export_vcf(family_mt, os.path.join(gcp_test_bucket, f'{family}.vcf.bgz'))
# Now need to filter rows again by doing a new variant_qc.
# Drop the previous variant_qc and sample_qc
print(
    "8. Drop old sample and variant QC and calculate new ones after filtering"
)
fields_to_drop = ['variant_QC_Hail', 'sample_QC_Hail']
mt1 = mt_entries_filtered.drop(*fields_to_drop)
mt2 = hl.sample_qc(mt1, name='sample_QC_Hail')
mt3 = hl.variant_qc(mt2, name='variant_QC_Hail')

# Remove variants with missingness > 3%
mt = mt3.filter_rows(mt3.variant_QC_Hail.call_rate >= 0.97)
print("Finished filtering. Now writing out.")

# 8. Export VCF only variants - no genotypes
print("Export VCF only variants -no genotypes")
mt1 = mt.select_entries()
mt2 = mt1.filter_cols(mt1['s'] == 'samplenone')
hl.export_vcf(mt2, f"{tmp_dir}/intervalwes/VCFs/shard1.vcf.bgz")

# 9. Write matrixtable
print("Write matrixtable")
mt = mt.checkpoint(f"{tmp_dir}/intervalwes/shard1_filtered.mt",
                   overwrite=True)
print("Finished writing mt. ")
# this is a python script loosely based on Kumar and Konrad's effort here: https://github.com/mkveerapen/covid19_sequencing
# again, some of the QC at our institution was done by our genome center, and therefore you should refer to the above link for more thorough QC
# specifically, variant recalibration should still be done, even if not shown here; can discuss with me on how to do it using gatk.

import hail as hl

# tmp_dir is where some of the temporary computations are done.
# I would make sure to assign it to a folder that does not have a strict data cap.
hl.init(spark_conf=None, tmp_dir='/path/to/tmp_dir/')

# import the data and sample QC
hl.import_vcf('/path/to/sequence.file.normID.noChrM.vcf.gz',
              min_partitions=4,
              reference_genome='GRCh38',
              force_bgz=True).write('/hailFiles/hail.full.normID.noChrM.mt', overwrite=True)

mtAll = hl.read_matrix_table('/hailFiles/hail.full.normID.noChrM.mt')
mtAll = mtAll.annotate_entries(AB=(mtAll.AD[1] / hl.sum(mtAll.AD)))
mtAll = hl.sample_qc(mtAll)
mtAll = mtAll.filter_cols((mtAll.sample_qc.call_rate >= 0.97) &
                          (mtAll.sample_qc.dp_stats.mean >= 20))
mtAll = mtAll.filter_entries(
    (mtAll.GQ >= 20) &
    (mtAll.DP >= 10) &
    ((mtAll.GT.is_hom_ref() & (mtAll.AB <= 0.1)) |
     (mtAll.GT.is_het() & (mtAll.AB >= 0.25) & (mtAll.AB <= 0.75)) |
     (mtAll.GT.is_hom_var() & (mtAll.AB >= 0.9))))

hl.export_vcf(mtAll, '/path/to/sequence.file.normID.GTflt.AB.noChrM.vcf.gz')
                     sample_file=ukb_sf,
                     index_file_map=file_map,
                     _row_fields=['rsid'])

# Extracting SNPs of interest
mt_f = hl.filter_intervals(mt, ploci)
mt_f = hl.variant_qc(mt_f)

chromdat['chrompos'] = chromdat['chrom'] + ':' + chromdat['hg19_pos'].astype(str)
chromdat_hl = hl.Table.from_pandas(chromdat)
chromdat_hl = chromdat_hl.annotate(
    locus=hl.parse_locus(chromdat_hl.chrompos, reference_genome='GRCh37'))
chromdat_hl = chromdat_hl.key_by('locus')
mt_f = mt_f.annotate_rows(**chromdat_hl[mt_f.locus])

flip = hl.case() \
    .when(mt_f.ea == mt_f.alleles[0], True) \
    .when(mt_f.ea == mt_f.alleles[1], False) \
    .or_missing()
mt_f = mt_f.annotate_rows(flip=flip)
mt_f = mt_f.annotate_rows(
    prior=2 * hl.if_else(mt_f.flip, mt_f.variant_qc.AF[0], mt_f.variant_qc.AF[1]))
mt_f = mt_f.select_entries(G=hl.coalesce(
    hl.if_else(mt_f.flip, 2 - mt_f.GT.n_alt_alleles(), mt_f.GT.n_alt_alleles()),
    mt_f.prior))

## Exporting result
output = '/ludc/Home/daniel_c/dva/files/ukbgeno/chrom{}.vcf.bgz'.format(ch)
hl.export_vcf(mt_f, output)

## Removing log file
logfile = glob.glob('*.log')
os.remove(logfile[0])
#! /usr/bin/python
import sys

import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(hl.rand_bool(0.5) * 2))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
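# --------------------------------------------------------------------------
# Hypothetical sanity check (not part of the original script): assuming the
# generator above was saved as make_test_data.py (name assumed) and run as
#   python make_test_data.py 10 100 /tmp/test_data
# the two exports can be re-imported and compared.
import hail as hl

vcf_mt = hl.import_vcf('/tmp/test_data.vcf')
plink_mt = hl.import_plink(bed='/tmp/test_data.bed',
                           bim='/tmp/test_data.bim',
                           fam='/tmp/test_data.fam')
assert vcf_mt.count() == plink_mt.count()  # same (variants, samples) in both exports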
        mt.cohort,
        hl.min((hl.agg.count_where(hl.is_missing(mt['GT']))) /
               mt.count_rows() * 2)))

mt = mt.annotate_rows(info=mt.info.annotate(
    cohort_names=mt.MAF_cohorts.keys()))
mt = mt.annotate_rows(info=mt.info.annotate(
    MAF_cohorts_values=mt.MAF_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    AN_cohorts_values=mt.AN_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    AC_cohorts=mt.AC_cohorts.values()))
mt = mt.annotate_rows(info=mt.info.annotate(
    missingness_cohorts_values=mt.missingness_cohorts.values()))

# mt = mt.checkpoint(
#     f'{tmp_dir}/Sanger_WES_mt_with_stats.mt', overwrite=True)

hl.export_vcf(
    mt,
    f'{tmp_dir}/Sanger_WES_chr1-7and20_after_RF_cohort_stats.vcf.bgz',
    parallel='separate_header')

mt1 = mt.select_entries()
mt_fin = mt1.filter_cols(mt1['s'] == 'sample')

hl.export_vcf(
    mt_fin,
    f"{tmp_dir}/Sanger_WES_chr1-7and20_stats_for_VEP.vcf.bgz",
    parallel='separate_header')