def test_combiner_plan_round_trip_serialization():
    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    plan_path = new_temp_file(extension='json')
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        save_path=plan_path,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        branch_factor=2,
                        batch_size=2)
    plan.save()
    plan_loaded = load_combiner(plan_path)
    assert plan == plan_loaded
def vds_combiner_chr22(*paths):
    with TemporaryDirectory() as tmpdir:
        with TemporaryDirectory() as outpath:
            parts = hl.eval([
                hl.parse_locus_interval('chr22:start-end',
                                        reference_genome='GRCh38')
            ])

            from hail.vds.combiner import new_combiner
            combiner = new_combiner(output_path=outpath,
                                    intervals=parts,
                                    temp_path=tmpdir,
                                    gvcf_paths=paths,
                                    reference_genome='GRCh38',
                                    branch_factor=16,
                                    target_records=10000000)
            combiner.run()
def test_combiner_manual_filtration():
    sample_names = all_samples[:2]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        gvcf_reference_entry_fields_to_keep=['GQ'],
                        gvcf_info_to_keep=['ExcessHet'],
                        force=True)

    assert plan.gvcf_info_to_keep == {'ExcessHet'}

    plan.run()
    vds = hl.vds.read_vds(out_file)
    assert list(vds.variant_data.gvcf_info) == ['ExcessHet']
    assert list(vds.reference_data.entry) == ['END', 'GQ']