Example #1
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
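    # args.chip_loci is assumed to be a whitespace-delimited file listing one
    # chip per line: a chip name (used in the output VCF name) followed by the
    # path to its position table.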
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(
                chip_pos.chr))
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        #  liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
Example #2
        def plinkify(ds, min=None, max=None):
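            # Helper for the IBD tests: export `ds` to VCF, run PLINK's --genome on
            # it, and parse the resulting .genome file. Relies on test helpers
            # (utils.new_temp_file, utils.uri_path) and on syscall/DEVNULL, which are
            # presumably subprocess.call and subprocess.DEVNULL imported elsewhere.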
            vcf = utils.new_temp_file(prefix="plink", suffix="vcf")
            plinkpath = utils.new_temp_file(prefix="plink")
            hl.export_vcf(ds, vcf)
            threshold_string = "{} {}".format("--min {}".format(min) if min else "",
                                              "--max {}".format(max) if max else "")

            plink_command = "plink --double-id --allow-extra-chr --vcf {} --genome full --out {} {}" \
                .format(utils.uri_path(vcf),
                        utils.uri_path(plinkpath),
                        threshold_string)
            result_file = utils.uri_path(plinkpath + ".genome")

            syscall(plink_command, shell=True, stdout=DEVNULL, stderr=DEVNULL)

            ### format of .genome file is:
            # _, fid1, iid1, fid2, iid2, rt, ez, z0, z1, z2, pihat, phe,
            # dst, ppc, ratio, ibs0, ibs1, ibs2, homhom, hethet (+ separated)

            ### format of ibd is:
            # i (iid1), j (iid2), ibd: {Z0, Z1, Z2, PI_HAT}, ibs0, ibs1, ibs2
            results = {}
            with open(result_file) as f:
                f.readline()
                for line in f:
                    row = line.strip().split()
                    results[(row[1], row[3])] = (list(map(float, row[6:10])),
                                                 list(map(int, row[14:17])))
            return results
Example #3
def main(args):

    hl.init(
        default_reference=args.import_build,
        log="/import_vcf.log",
        tmp_dir="hdfs:///import_vcf.tmp/",
    )

    logger.info("Importing filters info...")
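    # The "part two" VCF is assumed to be a sites-only VCF carrying per-variant
    # FILTER status; its rows are joined onto the genotype MT below.
    # find_replace rewrites the literal string "nul" to "." during import to
    # work around malformed values in the source files.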
    filters_ht = hl.import_vcf(
        args.part_two_path,
        reference_genome=args.import_build,
        force_bgz=True,
        find_replace=("nul", "."),
    ).rows()
    logger.info("Importing VCF...")
    # NOTE: always assumes file is bgzipped
    mt = hl.import_vcf(
        args.part_one_path,
        force_bgz=True,
        reference_genome=args.import_build,
        find_replace=("nul", "."),
    )
    mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
    logger.info(f"MT count: {mt.count()}")
    hl.export_vcf(mt, args.vcf_out, parallel="header_per_shard")
Example #4
def export_qced_file(mt: hl.MatrixTable,
                     out_dir: str,
                     basename: str,
                     export_type='hail'):
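    # export_type selects the output format: 'hail' writes a MatrixTable,
    # 'plink' writes PLINK files, and anything else falls through to a
    # bgzipped VCF.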
    outname = basename + '_qced'

    if export_type == 'hail':
        mt.write('{}GWASpy/Preimp_QC/{}.mt'.format(out_dir, outname),
                 overwrite=True)

    elif export_type == 'plink':
        hl.export_plink(dataset=mt,
                        output='{}GWASpy/Preimp_QC/{}'.format(
                            out_dir, outname),
                        fam_id=mt.fam_id,
                        ind_id=mt.s,
                        pat_id=mt.pat_id,
                        mat_id=mt.mat_id,
                        is_female=mt.is_female,
                        pheno=mt.is_case,
                        varid=mt.rsid)

    else:
        hl.export_vcf(mt,
                      '{}GWASpy/Preimp_QC/{}.vcf.bgz'.format(out_dir, outname))
Example #5
    def test_not_identical_headers(self):
        t = new_temp_file('vcf')
        mt = hl.import_vcf(resource('sample.vcf'))
        hl.export_vcf(mt.filter_cols((mt.s != "C1048::HG02024") & (mt.s != "HG00255")), t)

        with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
            (hl.import_vcf([resource('sample.vcf'), t])
             ._force_count_rows())
Example #6
    def test_not_identical_headers(self):
        t = new_temp_file('vcf')
        mt = hl.import_vcf(resource('sample.vcf'))
        hl.export_vcf(
            mt.filter_cols((mt.s != "C1048::HG02024") & (mt.s != "HG00255")),
            t)

        with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
            (hl.import_vcf([resource('sample.vcf'), t])._force_count_rows())
Example #7
    def test_export_vcf(self):
        dataset = hl.import_vcf(resource('sample.vcf.bgz'))
        vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
        hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
        dataset_imported = hl.import_vcf('/tmp/sample.vcf')
        self.assertTrue(dataset._same(dataset_imported))

        metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
        self.assertDictEqual(vcf_metadata, metadata_imported)
Example #8
def main(args):
    '''
    participant_data = args.participant_data
    sample_map = args.sample_map
    coverage_mt_path = args.coverage_mt_path
    vcf_col = args.vcf_col
    artifact_prone_sites_path = args.artifact_prone_sites_path
    output_bucket = args.output_bucket
    file_suffix = args.file_suffix
    minimum_homref_coverage = args.minimum_homref_coverage
    chunk_size = args.chunk_size
    overwrite = args.overwrite
    '''

    input_tsv = args.input_tsv

    artifact_prone_sites_path = args.artifact_prone_sites_path
    chunk_size = args.chunk_size
    overwrite = args.overwrite
    output_bucket = args.output_bucket
    file_suffix = args.file_suffix

    minimum_homref_coverage = args.minimum_homref_coverage
    coverage_mt_path = args.coverage_mt_path
    '''
    logger.info("Confirming existence of individual sample vcfs...")
    confirmed_vcfs = check_vcf_existence(
        participant_data, vcf_col, sample_map, output_bucket
    )
    '''

    logger.info("Combining VCFs...")
    combined_mt = join_mitochondria_vcfs_into_mt(input_tsv, output_bucket,
                                                 chunk_size)
    output_path_mt = f"{output_bucket}/raw_combined_mt.mt"
    combined_mt = combined_mt.checkpoint(output_path_mt, overwrite=overwrite)

    logger.info("Removing certain FT filters...")
    combined_mt = remove_FT_values(combined_mt)

    logger.info("Determining homoplasmic reference sites...")
    combined_mt = determine_hom_refs(combined_mt, coverage_mt_path,
                                     minimum_homref_coverage)

    logger.info("Applying artifact_prone_site filter...")
    combined_mt = apply_mito_artifact_filter(combined_mt,
                                             artifact_prone_sites_path)

    logger.info("Writing combined MT and VCF...")
    # set the file names for output files
    out_vcf = f"{output_bucket}/combined_{file_suffix}.vcf.bgz"
    out_mt = f"{output_bucket}/combined_{file_suffix}.mt"

    combined_mt.write(out_mt, overwrite=True)
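    # META_DICT is assumed to be a module-level dict of VCF header metadata
    # (e.g. FILTER/FORMAT descriptions) defined elsewhere in this script.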
    hl.export_vcf(combined_mt, out_vcf, metadata=META_DICT)

    logger.info("DONE")
Example #9
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(
            group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
            variant=hl.delimit(gene_ht.variants, '\t')
        ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
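            # BGEN stores diploid genotype probabilities, so haploid calls are first
            # promoted to homozygous diploid calls before converting GT to GP and
            # mean-imputing missing entries.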
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
Example #10
    def test_export_vcf(self):
        dataset = hl.import_vcf(resource('sample.vcf.bgz'))
        vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
        hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
        dataset_imported = hl.import_vcf('/tmp/sample.vcf')
        self.assertTrue(dataset._same(dataset_imported))

        no_sample_dataset = dataset.filter_cols(False).select_entries()
        hl.export_vcf(no_sample_dataset, '/tmp/no_sample.vcf', metadata=vcf_metadata)
        no_sample_dataset_imported = hl.import_vcf('/tmp/no_sample.vcf')
        self.assertTrue(no_sample_dataset._same(no_sample_dataset_imported))

        metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
        self.assertDictEqual(vcf_metadata, metadata_imported)
Example #11
def main(args):
    hl.init(default_reference="GRCh38", log="/qc_annotations.log")

    if args.compute_info:
        compute_info().write(get_info(split=False).path,
                             overwrite=args.overwrite)

    if args.split_info:
        split_info().write(get_info(split=True).path, overwrite=args.overwrite)

    if args.export_info_vcf:
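        # ht_to_vcf_mt (a gnomAD helper) presumably converts the sites Table into a
        # row-only MatrixTable so it can be written out as a sites VCF.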
        info_ht = get_info(split=False).ht()
        hl.export_vcf(ht_to_vcf_mt(info_ht), info_vcf_path())

    if args.generate_allele_data:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
        generate_allele_data(mt.rows()).write(allele_data.path,
                                              overwrite=args.overwrite)

    if args.generate_ac:  # TODO: compute AC and qc_AC as part of compute_info
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        ht = generate_ac(mt).checkpoint(
            "gs://gnomad-tmp/ac_tmp.ht",
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        ht.repartition(10000, shuffle=False).write(qc_ac.path,
                                                   overwrite=args.overwrite)

    if args.generate_fam_stats:
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True, samples_meta=True)
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
        fam_stats_ht = generate_fam_stats(mt, trios.path)
        fam_stats_ht = fam_stats_ht.checkpoint(
            "gs://gnomad-tmp/fam_stats_tmp.ht",
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        fam_stats_ht = fam_stats_ht.repartition(10000, shuffle=False)
        fam_stats_ht.write(fam_stats.path, overwrite=args.overwrite)

    if args.export_transmitted_singletons_vcf:
        export_transmitted_singletons_vcf()

    if args.vep:
        run_vep(vep_version=args.vep_version).write(vep.path,
                                                    overwrite=args.overwrite)
Example #12
def main(args):

    print("main")

    run_hash = "91b132aa"
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_ranked_denovo_ddd_comp.ht'
    )

    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-7and20_split_sampleqc_filtered.mt'
    )
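    # Label each variant as a SNP or an INDEL (hl.cond is the older Hail name
    # for hl.if_else).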
    mt = mt.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt.alleles[0], mt.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt.alleles[0], mt.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt.alleles[0], mt.alleles[1]), "INDEL",
                    "Other"))))
    mt = mt.annotate_rows(info=mt.info.annotate(
        rf_probability=ht[mt.row_key].rf_probability['TP']))
    mt = mt.annotate_rows(info=mt.info.annotate(score=ht[mt.row_key].score))

    filter_column_annotation = (
        hl.case().when(
            ((mt.Variant_Type == "SNP") & (mt.info.rf_probability <= 0.90)),
            "PASS").when(((mt.Variant_Type == "INDEL") &
                          (mt.info.rf_probability <= 0.80)),
                         "PASS").default(".")  # remove everything else
    )

    # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation)
    mt1 = mt.annotate_rows(filtercol=((filter_column_annotation)))
    mt_fail = mt1.filter_rows(mt1.filtercol == ".")
    print(mt_fail.count())

    mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol))
    mt_fail2 = mt2.filter_rows(mt2.filters.contains("."))
    mt_pass = mt2.filter_rows(mt2.filters.contains("PASS"))
    print(mt_fail2.count())
    print(mt_pass.count())

    mt2 = mt2.checkpoint(
        f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.mt',
        overwrite=True)

    hl.export_vcf(
        mt2,
        f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.vcf.bgz',
        parallel='separate_header')
Example #13
def main(args):
    tgp_vcf = hl.import_vcf(args.input_vcf,
                            force_bgz=True,
                            header_file='gs://neurogap/reference_data/header',
                            reference_genome='GRCh38',
                            min_partitions=200)
    sample_info = hl.import_table(args.sample_info).key_by('sample')
    tgp_vcf = tgp_vcf.annotate_cols(**sample_info[tgp_vcf.s])
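    # Keep continental African (AFR) samples but drop the admixed ASW and ACB
    # populations.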
    afr_vcf = tgp_vcf.filter_cols(
        (tgp_vcf.super_pop == 'AFR')
        & ~((tgp_vcf.pop == 'ASW') |
            (tgp_vcf.pop == 'ACB')))  #  NA18498 is missing
    sample_info.filter(hl.is_missing(
        tgp_vcf.cols()[sample_info['sample']])).show()
    hl.export_vcf(afr_vcf, args.output_filename)
Example #14
    def test_export_vcf(self):
        dataset = hl.import_vcf(resource('sample.vcf.bgz'))
        vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
        hl.export_vcf(dataset, '/tmp/sample.vcf', metadata=vcf_metadata)
        dataset_imported = hl.import_vcf('/tmp/sample.vcf')
        self.assertTrue(dataset._same(dataset_imported))

        no_sample_dataset = dataset.filter_cols(False).select_entries()
        hl.export_vcf(no_sample_dataset,
                      '/tmp/no_sample.vcf',
                      metadata=vcf_metadata)
        no_sample_dataset_imported = hl.import_vcf('/tmp/no_sample.vcf')
        self.assertTrue(no_sample_dataset._same(no_sample_dataset_imported))

        metadata_imported = hl.get_vcf_metadata('/tmp/sample.vcf')
        self.assertDictEqual(vcf_metadata, metadata_imported)
Example #15
def main(args):

    print("main")

    run_hash = "91b132aa"
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_ranked_denovo_ddd_comp.ht'
    )

    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_after_RF_final.mt'
    )

    hl.export_vcf(
        mt,
        f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.vcf.bgz',
        parallel='separate_header')
Example #16
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        # permute columns so not in alphabetical order!
        import random
        indices = list(range(mt.count_cols()))
        random.shuffle(indices)
        mt = mt.choose_cols(indices)

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command(["plink", "--vcf", split_vcf_file,
                     "--make-bed", "--out", plink_output,
                     "--const-fid", "--keep-allele-order"])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

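        # PLINK merge mode 6 runs a diff instead of a merge: mismatching calls
        # between the Hail export and the PLINK-converted VCF are written to the
        # .diff file checked below.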
        run_command(["plink", "--bfile", plink_output,
                     "--bmerge", hl_output, "--merge-mode",
                     "6", "--out", merge_output])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #17
    def test_impute_sex_same_as_plink(self):
        import subprocess as sp

        ds = hl.import_vcf(resource('x-chromosome.vcf'))

        sex = hl.impute_sex(ds.GT, include_par=True)

        vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
        out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))

        hl.export_vcf(ds, vcf_file)

        try:
            out = sp.check_output(
                ["plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
                 "--silent", "--out", out_file],
                stderr=sp.STDOUT)
        except sp.CalledProcessError as e:
            print(e.output)
            raise e

        plink_sex = hl.import_table(out_file + '.sexcheck',
                                    delimiter=' +',
                                    types={'SNPSEX': hl.tint32,
                                           'F': hl.tfloat64})
        plink_sex = plink_sex.select('IID', 'SNPSEX', 'F')
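        # PLINK codes SNPSEX as 1 = male, 2 = female, 0 = unknown; translate it into
        # Hail's boolean is_female, with missing for unknown.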
        plink_sex = plink_sex.select(
            s=plink_sex.IID,
            is_female=hl.cond(plink_sex.SNPSEX == 2,
                              True,
                              hl.cond(plink_sex.SNPSEX == 1,
                                      False,
                                      hl.null(hl.tbool))),
            f_stat=plink_sex.F).key_by('s')

        sex = sex.select(s=sex.s,
                         is_female=sex.is_female,
                         f_stat=sex.f_stat)

        self.assertTrue(plink_sex._same(sex.select_globals(), tolerance=1e-3))

        ds = ds.annotate_rows(aaf=(agg.call_stats(ds.GT, ds.alleles)).AF[1])

        self.assertTrue(hl.impute_sex(ds.GT)._same(hl.impute_sex(ds.GT, aaf='aaf')))
Example #18
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command([
            "plink", "--vcf", split_vcf_file, "--make-bed", "--out",
            plink_output, "--const-fid", "--keep-allele-order"
        ])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command([
            "plink", "--bfile", plink_output, "--bmerge", hl_output,
            "--merge-mode", "6", "--out", merge_output
        ])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #19
def export_transmitted_singletons_vcf():
    """
    Exports the transmitted singleton Table to a VCF.

    :return: None
    """
    qc_ac_ht = qc_ac.ht()

    for transmission_confidence in ['raw', 'adj']:
        ts_ht = qc_ac_ht.filter(
            (fam_stats.ht()[qc_ac_ht.key][f'n_transmitted_{transmission_confidence}'] == 1) &
            (qc_ac_ht.ac_qc_samples_raw == 2)
        )

        ts_ht = ts_ht.annotate(
            s=hl.null(hl.tstr)
        )

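        # Convert the sites Table to a MatrixTable and drop all columns so the
        # export below produces a sites-only VCF.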
        ts_mt = ts_ht.to_matrix_table_row_major(columns=['s'], entry_field_name='s')
        ts_mt = ts_mt.filter_cols(False)
        hl.export_vcf(ts_mt, get_transmitted_singleton_vcf_path(transmission_confidence), tabix=True)
Example #20
def main(args):
    """
    Subset joint-called VCF or MT to desired samples.

    Used when CMG/MGRC collaborators request VCFs of their data.
    """
    hl.init(log="/subset.log", default_reference="GRCh38")

    if args.vcf_path:
        logger.info("Importing VCF...")
        logger.warning("Assuming VCF is bgzipped!")
        mt = hl.import_vcf(args.vcf_path,
                           force_bgz=True,
                           reference_genome=args.import_build)

    else:
        logger.info("Reading in MT...")
        mt = hl.read_matrix_table(args.mt_path)

    logger.info(f"Input MT counts: {mt.count()}")
    mt.describe()

    if args.mapping:
        logger.info("Mapping VCF IDs to seqr IDs...")
        logger.warning("Assuming mapping file has the field names s and seqr_id!")
        mt = remap_sample_ids(mt, args.mapping)

    logger.info("Subsetting to specified samples and their variants...")
    mt = subset_samples_and_variants(mt,
                                     args.sample_list,
                                     header=args.no_header,
                                     table_key=args.table_key)
    logger.info(f"MT counts after subsetting: {mt.count()}")

    logger.info("Exporting VCF...")
    if "bgz" not in args.vcf_out:
        logger.warning(
            "Path to output VCF does not contain '.bgz'; export might be really slow!"
        )
    hl.export_vcf(mt, args.vcf_out, parallel=args.parallel)
Example #21
print("importing vds files")
vds = hl.read_matrix_table(vds_splitmulti_file)

print("removing lcr")
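# keep=False drops variants whose locus falls inside a low-complexity region
# defined by the BED file.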
lcr = hl.import_bed(lcr_file, reference_genome='GRCh38')
vds = vds.filter_rows(hl.is_defined(lcr[vds.locus]), keep=False)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Downcoding
#     Only keep GT for individual column
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("downcoding...")
vds = vds.select_entries(vds.GT)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Write output VCF
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("writing out vcf...")
hl.export_vcf(vds, qced_vcf_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# print Runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stop = timeit.default_timer()

print("runtime: " + str(stop - start) + " seconds")
Example #22
def export_vcf():
    mt = hl.read_matrix_table(resource('profile.mt'))
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
Example #23
def prepare_downloads_for_dataset(dataset_id):
    output_path = pipeline_config.get("output", "staging_path")

    dataset_prefix = os.path.join(output_path, dataset_id.lower())
    output_prefix = os.path.join(output_path, "downloads", dataset_id)

    gene_results_path = os.path.join(dataset_prefix, "gene_results.ht")
    gene_results = hl.read_table(gene_results_path)
    validate_gene_results_table(gene_results)

    gene_group_result_fields = gene_results.group_results.dtype.value_type.fields
    gene_results_dsv = gene_results
    gene_results_dsv = gene_results_dsv.transmute(group_results=hl.array(
        gene_results_dsv.group_results
    ).map(lambda group_and_result: group_and_result[1].annotate(
        group=group_and_result[0]).select("group", *gene_group_result_fields)))
    gene_results_dsv = gene_results_dsv.explode(gene_results_dsv.group_results,
                                                name="group_result")
    gene_results_dsv = gene_results_dsv.transmute(
        **gene_results_dsv.group_result)
    gene_results_dsv.export(
        os.path.join(output_prefix, f"{dataset_id}_gene_results.tsv.bgz"))

    variant_results_path = os.path.join(dataset_prefix, "variant_results.ht")
    variant_results = hl.read_table(variant_results_path)
    validate_variant_results_table(variant_results)

    variant_group_result_fields = variant_results.group_results.dtype.value_type.fields
    variant_results_dsv = variant_results
    variant_results_dsv = variant_results_dsv.transmute(
        **variant_results_dsv.info)
    variant_results_dsv = variant_results_dsv.transmute(
        group_results=hl.array(variant_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[
                1].annotate(group=group_and_result[0]).select(
                    "group", *variant_group_result_fields)))
    variant_results_dsv = variant_results_dsv.explode(
        variant_results_dsv.group_results, name="group_result")
    variant_results_dsv = variant_results_dsv.transmute(
        **variant_results_dsv.group_result)
    variant_results_dsv.export(
        os.path.join(output_prefix, f"{dataset_id}_variant_results.tsv.bgz"))

    variant_results_groups = variant_results.aggregate(
        hl.agg.explode(hl.agg.collect_as_set,
                       variant_results.group_results.keys()))

    variant_info_fields = variant_results.info.dtype.fields
    variant_base_fields = set(
        variant_results.row_value) - {"info", "group_results"}

    all_fields = list(variant_base_fields) + list(variant_info_fields) + list(
        variant_group_result_fields)
    assert len(all_fields) == len(set(all_fields)), "Conflicting field names"

    variant_results_vcf = variant_results
    variant_results_vcf = variant_results_vcf.annotate(
        groups=variant_results_vcf.group_results.keys())
    variant_results_vcf = variant_results_vcf.select(info=hl.struct(
        **{f: variant_results_vcf[f]
           for f in variant_base_fields},
        **{f: variant_results_vcf.info[f]
           for f in variant_info_fields},
        groups=variant_results_vcf.groups,
        **dict(
            map(
                lambda f: (
                    f,
                    variant_results_vcf.groups.map(
                        lambda group: variant_results_vcf.group_results[group][
                            f]),
                ),
                variant_group_result_fields,
            )),
    ), )

    def _convert_type(field):
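        # Arrays of booleans cannot be written to VCF INFO fields, so convert them
        # to 0/1 integers before export.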
        if isinstance(field.dtype, hl.tarray):
            if field.dtype.element_type == hl.tbool:
                return field.map(hl.int)

        return field

    variant_results_vcf = variant_results_vcf.annotate(
        info=variant_results_vcf.info.annotate(**{
            f: _convert_type(variant_results_vcf.info[f])
            for f in all_fields
        }))

    with NamedTemporaryFile("w") as header_file:
        header_file.write(
            f"analysis_groups={','.join(variant_results_groups)}")

        hl.export_vcf(
            variant_results_vcf,
            os.path.join(output_prefix,
                         f"{dataset_id}_variant_results.vcf.bgz"),
            append_to_header=f"file://{header_file.name}",
            metadata={
                "info": {
                    **{
                        f: {
                            "Number": str(len(variant_results_groups))
                        }
                        for f in variant_group_result_fields
                    }
                }
            },
        )
Example #24
def export_vcf():
    mt = hl.read_matrix_table(resource('profile.mt'))
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
Example #25
def main(args):
    hl.init(default_reference='GRCh38')

    hgdp_inputs = []
    tgp_inputs = []
    with hl.hadoop_open(
            'gs://hgdp_tgp/misc/tgp_plus_hgdp_30x_reblocked_gvcfs.txt',
            'r') as f:
        for line in f:
            line = line.strip()
            hgdp_inputs.append(line) if 'HGDP' in line else tgp_inputs.append(
                line)

    temp_bucket = 'gs://gnomad-tmp/tgp_hgdp'

    if args.get_sample_names:
        get_sample_names_from_list_of_files(tgp_inputs,
                                            get_samples_path('tgp'))
        get_sample_names_from_list_of_files(hgdp_inputs,
                                            get_samples_path('hgdp'))

    if args.create_sparse_mt:
        sample_names = get_sample_list_in_order(get_samples_path('tgp'),
                                                tgp_inputs)
        hl.experimental.run_combiner(tgp_inputs,
                                     out_file=get_reference_mt_path(
                                         'tgp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('tgp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)
        sample_names = get_sample_list_in_order(get_samples_path('hgdp'),
                                                hgdp_inputs)
        hl.experimental.run_combiner(hgdp_inputs,
                                     out_file=get_reference_mt_path(
                                         'hgdp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('hgdp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)
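        # Harmonize the gvcf_info entry schemas (drop fields present in only one
        # dataset) so combine_gvcfs can merge the two sparse MTs.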
        tgp_mt = hl.read_matrix_table(get_reference_mt_path('tgp',
                                                            sparse=True))
        tgp_mt = tgp_mt.annotate_entries(
            gvcf_info=tgp_mt.gvcf_info.drop('MQ0', 'VariantType')).drop(
                'AB', 'MQ0')
        hgdp_mt = hl.read_matrix_table(
            get_reference_mt_path('hgdp', sparse=True))
        hgdp_mt = hgdp_mt.annotate_entries(gvcf_info=hgdp_mt.gvcf_info.select(
            *tgp_mt.gvcf_info))
        mt = combine_gvcfs([tgp_mt, hgdp_mt])
        mt.write(get_reference_mt_path(sparse=True), overwrite=args.overwrite)

    if args.densify_mt:
        mt = hl.read_matrix_table(get_reference_mt_path(
            sparse=True)).key_rows_by('locus', 'alleles')
        mt = hl.experimental.densify(hl.experimental.sparse_split_multi(mt))
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)
        mt.naive_coalesce(5000).write(get_reference_mt_path(), args.overwrite)

    mt = hl.read_matrix_table(get_reference_mt_path()).drop('gvcf_info')
    hl.export_vcf(mt,
                  get_reference_mt_path(extension='vcf.bgz'),
                  parallel='header_per_shard')
Example #26
def export_vcf(mt_path):
    mt = hl.read_matrix_table(mt_path)
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
Example #27
with herzog.Cell("python"):
    mt = mt.annotate_rows(info=mt.info.annotate(AC=mt.variant_qc.AC))

with herzog.Cell("markdown"):
    """
    We export the VCF as several files (shards) to speed up the process.
    """

with herzog.Cell("python"):
    start_vcf_write_time = time.time()

with herzog.Cell("python"):
    mt = mt.repartition(25)
    hl.export_vcf(mt,
                  bucket + 'MyProject_MAFgt0.01.vcf.bgz',
                  parallel='header_per_shard')

with herzog.Cell("python"):
    elapsed_vcf_write_time = time.time() - start_vcf_write_time

with herzog.Cell("python"):
    print(timedelta(seconds=elapsed_vcf_write_time))

with herzog.Cell("markdown"):
    """
    Check that these files were successfully loaded to the bucket:
    """

with herzog.Cell("python"):
    get_ipython().system(
Example #28
               mt.count_rows() * 2)))

    mt = mt.annotate_rows(info=mt.info.annotate(
        cohort_names=mt.MAF_cohorts.keys()))
    mt = mt.annotate_rows(info=mt.info.annotate(
        MAF_cohorts_values=mt.MAF_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        AN_cohorts_values=mt.AN_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        AC_cohorts=mt.AC_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        missingness_cohorts_values=mt.missingness_cohorts.values()))

    mt = mt.checkpoint(
        f'{lustre_dir}/variant_qc/megaWES_mt_with_stats_w20_June_2021.mt',
        overwrite=True)
    hl.export_vcf(
        mt,
        f'{lustre_dir}/variant_qc/megaWES_mt_after_RF_cohort_stats_w20.vcf.bgz',
        parallel='separate_header')

    mt1 = mt.select_entries()
    mt_fin = mt1.filter_cols(mt1['s'] == 'sample')
    hl.export_vcf(
        mt_fin,
        f"{lustre_dir}/variant_qc/megaWES_mt_after_RF_stats_for_VEP_w20.vcf.bgz",
        parallel='separate_header')
Example #29
def main(
    json_str: str,
    dataset: str,
    reference: Optional[str],
    multi_fam: bool,
    skip_mt: bool,
    skip_vcf: bool,
):
    """
    This takes the family structures encoded in the JSON str and creates a number
    of single-family objects in Test

    The option to include a multi-family structure is useful for experimenting with
    operations mapping MOI patterns per-family within a larger dataset.
    This will be useful in analysing runtime/cost of analysing a single family with
    and without extracting from a larger dataset first

    Additional options permit the skipping of either the VCF or MT for each family subset
    """

    # parse the families dict from the input string, e.g. '{'fam1':['sam1','sam2']}'
    families_dict = json.loads(json_str)

    gcp_test_bucket = f'gs://cpg-{dataset}-test'

    # path built in this way to pass separate components to the GCP file check
    gcp_main_bucket = f'gs://cpg-{dataset}-main'
    mt_in_bucket = os.path.join('mt', f'{dataset}.mt')
    gcp_mt_full = os.path.join(gcp_main_bucket, mt_in_bucket)

    # collect all unique sample IDs for a single filter on the MT
    all_samples = get_all_unique_members(families_dict)

    mt = read_mt(gcp_mt_full, reference=reference)

    if multi_fam:
        # pull all samples from all requested families
        multi_fam_mt = obtain_mt_subset(mt, list(all_samples))
        # force-write this family MT to a test location
        multi_fam_mt.write(
            os.path.join(gcp_test_bucket, 'multiple_families.mt'),
            overwrite=True
        )

    # check samples all present
    check_samples_in_mt(all_samples, families_dict, mt)

    # for each family, dump both a small MT and a VCF containing the same samples/variants
    for family, samples in families_dict.items():

        # pull out only this family's samples from the MT
        family_mt = obtain_mt_subset(mt, samples)

        if not skip_mt:
            # write this family MT to a test location
            family_mt.write(os.path.join(gcp_test_bucket, f'{family}.mt'))

        if not skip_vcf:
            # revert to a VCF file format, and write to a test location
            # hail generic method, so the MT is an argument
            hl.export_vcf(family_mt, os.path.join(gcp_test_bucket, f'{family}.vcf.bgz'))
Example #30
    #Now need to filter rows again by doing a new variant_qc.
    #Drop the previous variant_qc and sample_qc
    print(
        "8. Drop old sample and variant QC and calculate new ones after filtering"
    )

    fields_to_drop = ['variant_QC_Hail', 'sample_QC_Hail']

    mt1 = mt_entries_filtered.drop(*fields_to_drop)

    mt2 = hl.sample_qc(mt1, name='sample_QC_Hail')
    mt3 = hl.variant_qc(mt2, name='variant_QC_Hail')

    #Remove variants with missingness > 3%
    mt = mt3.filter_rows(mt3.variant_QC_Hail.call_rate >= 0.97)

    print("Finished filtering. Now writing out.")

    #8. Export VCF with variants only - no genotypes
    print("Export VCF with variants only - no genotypes")
    mt1 = mt.select_entries()
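    # No sample is expected to be named 'samplenone', so this filter drops every
    # column and leaves a sites-only dataset for export.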
    mt2 = mt1.filter_cols(mt1['s'] == 'samplenone')
    hl.export_vcf(mt2, f"{tmp_dir}/intervalwes/VCFs/shard1.vcf.bgz")

    #9. Write matrixtable
    print("Write matrixtable")
    mt = mt.checkpoint(f"{tmp_dir}/intervalwes/shard1_filtered.mt",
                       overwrite=True)
    print("Finished writing mt. ")
Example #31
# This is a Python script loosely based on Kumar and Konrad's effort here: https://github.com/mkveerapen/covid19_sequencing
# Again, some of the QC at our institution was done by our genome center, so you should refer to the link above for more thorough QC.
# Specifically, variant recalibration should still be done even if it is not shown here; you can discuss with me how to do it using GATK.

import hail as hl

#tmp_dir is where some of the temporary computations are done. I would make sure to assign it to a folder that does not have a strict data cap.
hl.init(spark_conf=None, tmp_dir='/path/to/tmp_dir/')

#import the data and sample QC
hl.import_vcf('/path/to/sequence.file.normID.noChrM.vcf.gz',
              min_partitions=4,
              reference_genome='GRCh38',
              force_bgz=True).write('/hailFiles/hail.full.normID.noChrM.mt',
                                    overwrite=True)

mtAll = hl.read_matrix_table('/hailFiles/hail.full.normID.noChrM.mt')
mtAll = mtAll.annotate_entries(AB=(mtAll.AD[1] / hl.sum(mtAll.AD)))
mtAll = hl.sample_qc(mtAll)
mtAll = mtAll.filter_cols((mtAll.sample_qc.call_rate >= 0.97)
                          & (mtAll.sample_qc.dp_stats.mean >= 20))
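# Genotype-level filters: keep calls with GQ >= 20, DP >= 10, and an allele
# balance consistent with the called genotype (hom-ref <= 0.1, het 0.25-0.75,
# hom-alt >= 0.9).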
mtAll = mtAll.filter_entries((mtAll.GQ >= 20) & (mtAll.DP >= 10) & (
    (mtAll.GT.is_hom_ref() & (mtAll.AB <= 0.1))
    | (mtAll.GT.is_het() & (mtAll.AB >= 0.25) & (mtAll.AB <= 0.75))
    | (mtAll.GT.is_hom_var() & (mtAll.AB >= 0.9))))

hl.export_vcf(mtAll, '/path/to/sequence.file.normID.GTflt.AB.noChrM.vcf.gz')
Example #32
                        sample_file=ukb_sf,
                        index_file_map=file_map,
                        _row_fields=['rsid'])
    # Extracting SNPs of interest
    mt_f = hl.filter_intervals(mt, ploci)
    mt_f = hl.variant_qc(mt_f)
    chromdat['chrompos'] = chromdat['chrom'] + ':' + chromdat[
        'hg19_pos'].astype(str)
    chromdat_hl = hl.Table.from_pandas(chromdat)
    chromdat_hl = chromdat_hl.annotate(
        locus=hl.parse_locus(chromdat_hl.chrompos, reference_genome='GRCh37'))
    chromdat_hl = chromdat_hl.key_by('locus')
    mt_f = mt_f.annotate_rows(**chromdat_hl[mt_f.locus])
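    # flip is True when the effect allele (ea) is the reference allele; dosages are
    # then counted on the effect allele, and missing genotypes are mean-imputed
    # with twice the effect-allele frequency (prior).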
    flip = hl.case().when(mt_f.ea == mt_f.alleles[0],
                          True).when(mt_f.ea == mt_f.alleles[1],
                                     False).or_missing()
    mt_f = mt_f.annotate_rows(flip=flip)
    mt_f = mt_f.annotate_rows(
        prior=2 *
        hl.if_else(mt_f.flip, mt_f.variant_qc.AF[0], mt_f.variant_qc.AF[1]))
    mt_f = mt_f.select_entries(G=hl.coalesce(
        hl.if_else(mt_f.flip, 2 - mt_f.GT.n_alt_alleles(),
                   mt_f.GT.n_alt_alleles()), mt_f.prior))
    ## Exporting result
    output = '/ludc/Home/daniel_c/dva/files/ukbgeno/chrom{}.vcf.bgz'.format(ch)
    hl.export_vcf(mt_f, output)

## Removing log file
logfile = glob.glob('*.log')
os.remove(logfile[0])
Example #33
#! /usr/bin/python

import sys
import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s = hl.str(mt.sample_idx))
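# Regenerate GT as random homozygous calls: diploid genotype index 0 (0/0) or
# 2 (1/1), each with probability 0.5.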
mt = mt.annotate_entries(GT = hl.unphased_diploid_gt_index_call(hl.rand_bool(0.5) * 2))
hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
Example #34
        mt.cohort,
        hl.min((hl.agg.count_where(hl.is_missing(mt['GT']))) /
               mt.count_rows() * 2)))

    mt = mt.annotate_rows(info=mt.info.annotate(
        cohort_names=mt.MAF_cohorts.keys()))
    mt = mt.annotate_rows(info=mt.info.annotate(
        MAF_cohorts_values=mt.MAF_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        AN_cohorts_values=mt.AN_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        AC_cohorts=mt.AC_cohorts.values()))

    mt = mt.annotate_rows(info=mt.info.annotate(
        missingness_cohorts_values=mt.missingness_cohorts.values()))

    # mt = mt.checkpoint(
    #    f'{tmp_dir}/Sanger_WES_mt_with_stats.mt', overwrite=True)
    hl.export_vcf(
        mt,
        f'{tmp_dir}/Sanger_WES_chr1-7and20_after_RF_cohort_stats.vcf.bgz',
        parallel='separate_header')

    mt1 = mt.select_entries()
    mt_fin = mt1.filter_cols(mt1['s'] == 'sample')
    hl.export_vcf(mt_fin,
                  f"{tmp_dir}/Sanger_WES_chr1-7and20_stats_for_VEP.vcf.bgz",
                  parallel='separate_header')