Example 1
# Assumed imports (the original snippet omits them): `hl` is Hail, and `vc_all` is
# assumed here to alias Hail's experimental VCF combiner module, which provides
# CombinerConfig, calculate_even_genome_partitioning, and transform_gvcf.
import hail as hl
import hail.experimental.vcf_combiner.vcf_combiner as vc_all


def import_and_transform_gvcf(path):
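    # Partition GRCh38 into even intervals using the default exome interval size.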
    size = vc_all.CombinerConfig.default_exome_interval_size
    intervals = vc_all.calculate_even_genome_partitioning('GRCh38', size)

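    # Import the single GVCF across those intervals, transform it into the combiner's
    # representation, and force evaluation so the work actually runs.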
    [mt] = hl.import_gvcfs([path], intervals, reference_genome='GRCh38')
    mt = vc_all.transform_gvcf(mt)
    mt._force_count()
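
A call might look like the sketch below; the GVCF path is a placeholder, not a real file.

import_and_transform_gvcf('gs://my-bucket/NA12878.g.vcf.bgz')  # hypothetical path
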
Example 2
# Assumed imports (the original snippet omits them); the standard-library and typing
# imports below are safe to state, while the Hail-internal names (Interval, warning,
# VariantDatasetCombiner, VDSMetadata, load_combiner, defined_entry_fields,
# calculate_even_genome_partitioning, and the `vc` alias used just below) come from
# Hail's VDS combiner package and are assumed to be in scope.
import hashlib
import os
from typing import Collection, Dict, List, Optional, Union

import hail as hl


def default_exome_intervals(rg):
    return vc.calculate_even_genome_partitioning(
        rg, 2**32)  # 4 billion, larger than any contig


def new_combiner(
    *,
    output_path: str,
    temp_path: str,
    save_path: Optional[str] = None,
    gvcf_paths: Optional[List[str]] = None,
    vds_paths: Optional[List[str]] = None,
    vds_sample_counts: Optional[List[int]] = None,
    intervals: Optional[List[Interval]] = None,
    import_interval_size: Optional[int] = None,
    use_genome_default_intervals: bool = False,
    use_exome_default_intervals: bool = False,
    gvcf_external_header: Optional[str] = None,
    gvcf_sample_names: Optional[List[str]] = None,
    gvcf_info_to_keep: Optional[Collection[str]] = None,
    gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
    branch_factor: int = VariantDatasetCombiner.default_branch_factor,
    target_records: int = VariantDatasetCombiner.default_target_records,
    batch_size: int = VariantDatasetCombiner.default_gvcf_batch_size,
    reference_genome: Union[str, hl.ReferenceGenome] = 'default',
    contig_recoding: Optional[Dict[str, str]] = None,
    force: bool = False,
) -> VariantDatasetCombiner:
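    # Validate inputs: require at least one input source and check that parallel
    # arguments (vds_sample_counts, gvcf_sample_names/gvcf_external_header) are consistent.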
    if not (gvcf_paths or vds_paths):
        raise ValueError(
            "at least one  of 'gvcf_paths' or 'vds_paths' must be nonempty")
    if gvcf_paths is None:
        gvcf_paths = []
    if vds_paths is None:
        vds_paths = []
    if vds_sample_counts is not None and len(vds_paths) != len(
            vds_sample_counts):
        raise ValueError(
            "'vds_paths' and 'vds_sample_counts' (if present) must have the same length "
            f'{len(vds_paths)} != {len(vds_sample_counts)}')
    if (gvcf_sample_names is None) != (gvcf_external_header is None):
        raise ValueError(
            "both 'gvcf_sample_names' and 'gvcf_external_header' must be set or unset"
        )
    if gvcf_sample_names is not None and len(gvcf_sample_names) != len(
            gvcf_paths):
        raise ValueError(
            "'gvcf_sample_names' and 'gvcf_paths' must have the same length "
            f'{len(gvcf_sample_names)} != {len(gvcf_paths)}')

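    # Count how many of the mutually exclusive partitioning arguments were supplied;
    # exactly one is expected.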
    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'new_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )

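    # Helper: when not forced to start over, try to resume from an existing combiner
    # plan serialized at save_path.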
    def maybe_load_from_saved_path(
            save_path: str) -> Optional[VariantDatasetCombiner]:
        if force:
            return None
        fs = hl.current_backend().fs
        if fs.exists(save_path):
            try:
                combiner = load_combiner(save_path)
                warning(
                    f'found existing combiner plan at {save_path}, using it')
                # These values are serialized with the plan but are not part of the hash
                # used for an autogenerated save path, so we overwrite them here: users
                # should be able to adjust them when resuming a combine (a common reason
                # to resume is a failure caused by too large a branch factor).
                combiner.branch_factor = branch_factor
                combiner.target_records = target_records
                combiner.gvcf_batch_size = batch_size
                return combiner
            except (ValueError, TypeError, OSError, KeyError):
                warning(
                    f'file exists at {save_path}, but it is not a valid combiner plan, overwriting'
                )
        return None

    # We do the first save_path check now after validating the arguments
    if save_path is not None:
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner

    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored."
        )

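    # Resolve the GVCF import intervals, in order of precedence: explicit intervals,
    # then import_interval_size, then the genome or exome default interval size.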
    if intervals is not None:
        pass
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
    elif use_genome_default_intervals:
        size = VariantDatasetCombiner.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    elif use_exome_default_intervals:
        size = VariantDatasetCombiner.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    assert intervals is not None

    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

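    # If the caller did not say which reference-block entry fields to keep, infer them:
    # from the first VDS, keep all reference entry fields except END; from the first GVCF,
    # keep the fields observed to be defined in reference blocks, minus GT, PGT, and PL.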
    if gvcf_reference_entry_fields_to_keep is None and vds_paths:
        vds = hl.vds.read_vds(vds_paths[0])
        gvcf_reference_entry_fields_to_keep = set(
            vds.reference_data.entry) - {'END'}
    elif gvcf_reference_entry_fields_to_keep is None and gvcf_paths:
        mt = hl.import_vcf(gvcf_paths[0],
                           force_bgz=True,
                           reference_genome=reference_genome)
        mt = mt.filter_rows(hl.is_defined(mt.info.END))
        gvcf_reference_entry_fields_to_keep = defined_entry_fields(
            mt, 100_000) - {'GT', 'PGT', 'PL'}

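    # No save path was given: derive a deterministic one by hashing the inputs that
    # identify this combine, so re-running with the same arguments finds the same plan.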
    if save_path is None:
        sha = hashlib.sha256()
        sha.update(output_path.encode())
        sha.update(temp_path.encode())
        sha.update(str(reference_genome).encode())
        for path in vds_paths:
            sha.update(path.encode())
        for path in gvcf_paths:
            sha.update(path.encode())
        if gvcf_external_header is not None:
            sha.update(gvcf_external_header.encode())
        if gvcf_sample_names is not None:
            for name in gvcf_sample_names:
                sha.update(name.encode())
        if gvcf_info_to_keep is not None:
            for kept_info in sorted(gvcf_info_to_keep):
                sha.update(kept_info.encode())
        if gvcf_reference_entry_fields_to_keep is not None:
            for field in sorted(gvcf_reference_entry_fields_to_keep):
                sha.update(field.encode())
        if contig_recoding is not None:
            for key, value in sorted(contig_recoding.items()):
                sha.update(key.encode())
                sha.update(value.encode())
        for interval in intervals:
            sha.update(str(interval).encode())
        digest = sha.hexdigest()
        name = f'vds-combiner-plan_{digest}_{hl.__pip_version__}.json'
        save_path = os.path.join(temp_path, 'combiner-plans', name)
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner
        else:
            warning(f'generated combiner save path of {save_path}')

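    # Pair each input VDS with its sample count, reading the VDS only when counts were
    # not supplied by the caller.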
    if vds_sample_counts:
        vdses = [
            VDSMetadata(path, n_samples)
            for path, n_samples in zip(vds_paths, vds_sample_counts)
        ]
    else:
        vdses = []
        for path in vds_paths:
            vds = hl.vds.read_vds(path)
            n_samples = vds.n_samples()
            vdses.append(VDSMetadata(path, n_samples))

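    # Put the largest VDSes (by sample count) first.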
    vdses.sort(key=lambda x: x.n_samples, reverse=True)

    return VariantDatasetCombiner(
        save_path=save_path,
        output_path=output_path,
        temp_path=temp_path,
        reference_genome=reference_genome,
        branch_factor=branch_factor,
        target_records=target_records,
        gvcf_batch_size=batch_size,
        contig_recoding=contig_recoding,
        vdses=vdses,
        gvcfs=gvcf_paths,
        gvcf_import_intervals=intervals,
        gvcf_external_header=gvcf_external_header,
        gvcf_sample_names=gvcf_sample_names,
        gvcf_info_to_keep=gvcf_info_to_keep,
        gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep
    )
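
For orientation, a minimal usage sketch follows. It assumes this function is exposed as hl.vds.new_combiner and that the returned VariantDatasetCombiner has a run() method; all paths are placeholders.

import hail as hl

# Minimal sketch under the assumptions above: plan a combine of two GVCFs and run it,
# writing the merged VDS to output_path.
combiner = hl.vds.new_combiner(
    output_path='gs://my-bucket/dataset.vds',  # hypothetical output path
    temp_path='gs://my-bucket/tmp',            # hypothetical temp directory
    gvcf_paths=[
        'gs://my-bucket/sample1.g.vcf.bgz',    # hypothetical inputs
        'gs://my-bucket/sample2.g.vcf.bgz',
    ],
    use_genome_default_intervals=True,
    reference_genome='GRCh38',
)
combiner.run()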