Example #1
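A Hail test asserting that the legacy `vcf_combiner` and the newer VDS combiner produce the same merged sparse MatrixTable from a pair of GVCFs.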
def test_vcf_vds_combiner_equivalence():
    # `resource` (a test-data path helper) and `defined_entry_fields` are
    # assumed to be in scope from Hail's test suite and combiner internals.
    import hail as hl
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    # Imported as `vds_combiner` to avoid clashing with the VariantDataset
    # built below.
    import hail.vds.combiner as vds_combiner
    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = [mt.annotate_rows(info=mt.info.annotate(
        MQ_DP=hl.missing(hl.tint32),
        VarDP=hl.missing(hl.tint32),
        QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    # Keep the reference-block entry fields that are actually populated (rows
    # with a defined END are the reference blocks), minus the call fields.
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    combined_vds = vds_combiner.combine_variant_datasets(
        [vds_combiner.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep)
         for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(combined_vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
Example #2
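A benchmark that merges COMBINE_GVCF_MAX transformed copies of one GVCF and writes 20 copies of the resulting matrix table to a temporary directory in a single multi-write.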
def compile_2k_merge(path):
    # `setup`, `vc_all` (the vcf_combiner module), COMBINE_GVCF_MAX, and
    # TemporaryDirectory are assumed to come from the benchmark suite's
    # imports; `os` and `hl` (hail) are also required.
    vcf = setup(path)
    vcfs = [vc_all.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
    # One combined table, listed 20 times so that 20 tables are written below.
    combined = [vc_all.combine_gvcfs(vcfs)] * 20
    with TemporaryDirectory() as tmpdir:
        hl.experimental.write_matrix_tables(combined,
                                            os.path.join(
                                                tmpdir,
                                                'combiner-multi-write'),
                                            overwrite=True)
Example #3
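The same benchmark as Example #2, with Hail's internal `no_ir_logging` flag enabled for the duration of the run and restored afterwards.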
def compile_2k_merge(path):
    # Same benchmark, but with Hail's internal 'no_ir_logging' flag enabled
    # while the work runs.
    flagname = 'no_ir_logging'
    prev_flag_value = hl._get_flags(flagname).get(flagname)
    try:
        hl._set_flags(**{flagname: '1'})
        vcf = setup(path)
        vcfs = [vc_all.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
        combined = [vc_all.combine_gvcfs(vcfs)] * 20
        with TemporaryDirectory() as tmpdir:
            hl.experimental.write_matrix_tables(combined,
                                                os.path.join(
                                                    tmpdir,
                                                    'combiner-multi-write'),
                                                overwrite=True)
    finally:
        # Restore the flag's previous value even if the benchmark raises.
        hl._set_flags(**{flagname: prev_flag_value})
Example #4
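A pipeline script that combines HGDP and 1000 Genomes (TGP) reblocked GVCFs into sparse MatrixTables with `hl.experimental.run_combiner`, harmonizes their `gvcf_info` fields, merges the two cohorts, optionally densifies the result, and exports a VCF.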
def main(args):
    # The get_*_path and get_sample_* helpers, and `combine_gvcfs`, are assumed
    # to be defined or imported elsewhere in this script.
    hl.init(default_reference='GRCh38')

    hgdp_inputs = []
    tgp_inputs = []
    with hl.hadoop_open(
            'gs://hgdp_tgp/misc/tgp_plus_hgdp_30x_reblocked_gvcfs.txt',
            'r') as f:
        for line in f:
            line = line.strip()
            # Route each GVCF path to the HGDP or 1000 Genomes (TGP) list.
            if 'HGDP' in line:
                hgdp_inputs.append(line)
            else:
                tgp_inputs.append(line)

    temp_bucket = 'gs://gnomad-tmp/tgp_hgdp'

    if args.get_sample_names:
        get_sample_names_from_list_of_files(tgp_inputs,
                                            get_samples_path('tgp'))
        get_sample_names_from_list_of_files(hgdp_inputs,
                                            get_samples_path('hgdp'))

    if args.create_sparse_mt:
        sample_names = get_sample_list_in_order(get_samples_path('tgp'),
                                                tgp_inputs)
        hl.experimental.run_combiner(tgp_inputs,
                                     out_file=get_reference_mt_path(
                                         'tgp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('tgp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)
        sample_names = get_sample_list_in_order(get_samples_path('hgdp'),
                                                hgdp_inputs)
        hl.experimental.run_combiner(hgdp_inputs,
                                     out_file=get_reference_mt_path(
                                         'hgdp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('hgdp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)
        tgp_mt = hl.read_matrix_table(get_reference_mt_path('tgp',
                                                            sparse=True))
        tgp_mt = tgp_mt.annotate_entries(
            gvcf_info=tgp_mt.gvcf_info.drop('MQ0', 'VariantType')).drop(
                'AB', 'MQ0')
        hgdp_mt = hl.read_matrix_table(
            get_reference_mt_path('hgdp', sparse=True))
        hgdp_mt = hgdp_mt.annotate_entries(gvcf_info=hgdp_mt.gvcf_info.select(
            *tgp_mt.gvcf_info))
        mt = combine_gvcfs([tgp_mt, hgdp_mt])
        mt.write(get_reference_mt_path(sparse=True), overwrite=args.overwrite)

    if args.densify_mt:
        mt = hl.read_matrix_table(get_reference_mt_path(
            sparse=True)).key_rows_by('locus', 'alleles')
        mt = hl.experimental.densify(hl.experimental.sparse_split_multi(mt))
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)
        mt.naive_coalesce(5000).write(get_reference_mt_path(), args.overwrite)

    mt = hl.read_matrix_table(get_reference_mt_path()).drop('gvcf_info')
    hl.export_vcf(mt,
                  get_reference_mt_path(extension='vcf.bgz'),
                  parallel='header_per_shard')
Example #5
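A benchmark that combines 10,000 copies of a transformed GVCF in batches of COMBINE_GVCF_MAX, exercising only Python-side plan construction.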
def python_only_10k_combine(path):
    # Builds combine plans for 10,000 copies of one transformed GVCF in
    # batches of COMBINE_GVCF_MAX; nothing is written or executed, so this
    # measures Python-side overhead only.
    vcf = setup(path)
    mt = vc_all.transform_gvcf(vcf)
    mts = [mt] * 10_000
    _ = [vc_all.combine_gvcfs(chunk) for chunk in chunks(mts, COMBINE_GVCF_MAX)]
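The `chunks` helper used above comes from the benchmark utilities and is not shown; the following is a minimal sketch consistent with how it is called here, not the actual implementation:

def chunks(seq, size):
    # Yield successive slices of `seq` with at most `size` elements each
    # (hypothetical stand-in for the benchmark suite's helper).
    for i in range(0, len(seq), size):
        yield seq[i:i + size]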
Example #6
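The implementation of `combine_variant_datasets` from Hail's VDS combiner: the reference and variant components of each VariantDataset are merged separately and then reassembled.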
def combine_variant_datasets(vdss: List[VariantDataset]) -> VariantDataset:
    # Merge the reference-block components and the variant components of the
    # input VariantDatasets separately.
    reference = combine_references([vds.reference_data for vds in vdss])
    # The GVCF-style merge expects rows keyed by locus alone, so drop
    # 'alleles' from the row key before combining.
    no_variant_key = [vds.variant_data.key_rows_by('locus') for vds in vdss]

    variants = combine_gvcfs(no_variant_key)
    return VariantDataset(reference, variants._key_rows_by_assert_sorted('locus', 'alleles'))
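`_key_rows_by_assert_sorted` is an internal method that changes the row key while asserting the data is already sorted by the new key, so the ('locus', 'alleles') key is restored without a shuffle.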