Esempio n. 1
0
 def min_rep(locus, ref, alt):
     """Return a ``struct(ref, alt)`` holding the minimal representation.

     Three outcomes, checked in order:

     * ``alt`` equals ``'<NON_REF>'``: ``ref`` is cut down to its first
       character and ``alt`` is passed through unchanged.
     * ``hl.min_rep`` leaves the locus unchanged: the min-repped alleles
       are returned.
     * otherwise: evaluation fails with an error.
     """
     reduced = hl.min_rep(locus, [ref, alt])

     # Symbolic alleles are not min-repped; only the reference is trimmed.
     non_ref_result = hl.struct(ref=ref[0:1], alt=alt)
     reduced_result = hl.struct(ref=reduced.alleles[0],
                                alt=reduced.alleles[1])

     builder = hl.case()
     builder = builder.when(alt == '<NON_REF>', non_ref_result)
     builder = builder.when(locus == reduced.locus, reduced_result)
     return builder.or_error("locus before and after minrep differ")
def import_vcf(vcf_path: str,
               genome_version: str,
               min_partitions: int = None,
               force_bgz: bool = True,
               drop_samples: bool = False,
               skip_invalid_loci: bool = False,
               split_multi_alleles: bool = True):
    """Read a VCF into a Hail MatrixTable keyed on the chosen reference.

    Contig names are recoded so that a VCF with or without the "chr"
    prefix matches the selected reference genome.

    :param str vcf_path: path of the VCF to import
    :param str genome_version: "37" or "38"
    :param int min_partitions: passed through to ``hl.import_vcf``
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not
        consistent with the reference genome
    :param bool split_multi_alleles: if True, split multi-allelic rows and
        re-key the rows on the min-repped locus/alleles
    :return: the imported MatrixTable, with globals ``sourceFilePath`` and
        ``genomeVersion`` and row field ``original_alt_alleles`` (missing
        unless the row has more than two alleles)
    :raises ValueError: if ``genome_version`` is not "37" or "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    logger.info(f"\n==> import vcf: {vcf_path}")

    reference_name = f"GRCh{genome_version}"
    contigs = hl.get_reference(reference_name).contigs

    # Map the "other" spelling of every contig onto the reference's own
    # spelling: strip "chr" where the reference has it, add it where not.
    without_prefix = {
        name.replace("chr", ""): name for name in contigs if "chr" in name
    }
    with_prefix = {
        f"chr{name}": name for name in contigs if "chr" not in name
    }
    contig_recoding = {**without_prefix, **with_prefix}

    mt = hl.import_vcf(
        vcf_path,
        reference_genome=reference_name,
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
    )
    mt = mt.annotate_globals(sourceFilePath=vcf_path,
                             genomeVersion=genome_version)

    # Record the pre-split variant ids, but only for multi-allelic rows.
    is_multi_allelic = hl.len(mt.alleles) > 2
    variant_ids = get_expr_for_variant_ids(mt.locus, mt.alleles)
    mt = mt.annotate_rows(
        original_alt_alleles=hl.or_missing(is_multi_allelic, variant_ids))

    if not split_multi_alleles:
        return mt

    mt = hl.split_multi_hts(mt)
    return mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))
def import_vcf(
        vcf_path: Union[str,List[str]],
        genome_version: str,
        sample_name: str,
        # Exomes VCFs can be split to ~1mb chunks for annotation (good for joins)
        # but they are pretty tiny and I think too big is bad.
        min_partitions: int = 50,
        force_bgz: bool = False,
        drop_samples: bool = False,
        skip_invalid_loci: bool = False,
        split_multi_alleles: bool = True):
    """Import one or more VCFs and return a MatrixTable.

    Contig names are recoded so that a VCF with or without the "chr"
    prefix matches the selected reference genome. Rows are then restricted
    to chromosomes 1-22, X, Y and the mitochondrial contig, using the
    contig spelling of the selected reference.

    :param vcf_path: path (or list of paths) of the VCF(s) to import
    :param str genome_version: "37" or "38"
    :param str sample_name: accepted but not used by this function
    :param int min_partitions: passed through to ``hl.import_vcf``
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not
        consistent with the reference genome
    :param bool split_multi_alleles: if True, split multi-allelic rows and
        re-key the rows on the min-repped locus/alleles
    :return: MatrixTable with globals ``sourceFilePath``/``genomeVersion``
        and row fields ``originalAltAlleles``, ``xpos`` and ``ref``
    :raises ValueError: if ``genome_version`` is not "37" or "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    reference_name = f"GRCh{genome_version}"

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(reference_name)
    contig_recoding = {
        **{ref_contig.replace("chr", ""): ref_contig for ref_contig in ref.contigs if "chr" in ref_contig},
        **{f"chr{ref_contig}": ref_contig for ref_contig in ref.contigs if "chr" not in ref_contig}}
    mt = hl.import_vcf(
        vcf_path,
        reference_genome=reference_name,
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci,
        array_elements_required=False)

    # After import the loci use the reference's contig names, so the
    # allow-list must be spelled the same way. The previous hard-coded
    # un-prefixed set ('1', ..., 'MT') matched nothing on GRCh38
    # ('chr1', ..., 'chrM') and silently filtered out every row.
    primary_contigs = [str(number) for number in range(1, 23)] + ["X", "Y"]
    if genome_version == "37":
        valid_chros = {*primary_contigs, "MT"}
    else:
        valid_chros = {f"chr{contig}" for contig in primary_contigs} | {"chrM"}
    mt = mt.filter_rows(hl.literal(valid_chros).contains(mt.locus.contig))

    # Checkpoint: write the filtered table and continue from the on-disk
    # copy. NOTE(review): the path is fixed and overwritten on every call,
    # so concurrent calls would write to the same location.
    mt.write("/vep/tmpck1.mt",overwrite=True)
    mt = hl.read_matrix_table("/vep/tmpck1.mt")

    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)

    # These row fields are computed before splitting, so they describe the
    # original (possibly multi-allelic) record.
    mt = mt.annotate_rows(
        originalAltAlleles=hl.or_missing(hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus, mt.alleles))
    )
    mt = mt.annotate_rows(
        xpos=get_expr_for_xpos(mt.locus)
    )
    mt = mt.annotate_rows(
        ref=mt.alleles[0]
    )

    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))
    return mt
Esempio n. 4
0
 def struct_from_min_rep(i):
     """Build the split-row struct for alternate allele index ``i``.

     The reference allele and allele ``i`` of ``ds`` are min-repped
     together. If min-rep leaves the locus unchanged the resulting struct
     carries the original locus, the two min-repped alleles, ``a_index=i``
     and ``was_split=True``; otherwise evaluation fails with an error.
     """
     def from_min_rep(mr):
         split_row = hl.struct(locus=ds.locus,
                               alleles=[mr.alleles[0], mr.alleles[1]],
                               a_index=i,
                               was_split=True)
         unchanged_locus = ds.locus == mr.locus
         return hl.case().when(unchanged_locus, split_row).or_error(
             "Found non-left-aligned variant in sparse_split_multi")

     min_repped = hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]])
     return hl.bind(from_min_rep, min_repped)
Esempio n. 5
0
 def struct_from_min_rep(i):
     """Build the split-row struct for alternate allele index ``i``.

     The reference allele and allele ``i`` of ``ds`` are min-repped
     together, then, in order:

     * locus unchanged by min-rep: a struct with the original locus, the
       two min-repped alleles, ``a_index=i`` and ``was_split=True``;
     * ``filter_changed_loci`` is true: a missing struct of the same type;
     * otherwise: evaluation fails with an error listing the locus and
       alleles before and after min-rep.
     """
     def from_min_rep(mr):
         split_row = hl.struct(locus=ds.locus,
                               alleles=[mr.alleles[0], mr.alleles[1]],
                               a_index=i,
                               was_split=True)
         missing_row = hl.null(
             hl.tstruct(locus=ds.locus.dtype,
                        alleles=hl.tarray(hl.tstr),
                        a_index=hl.tint,
                        was_split=hl.tbool))
         # Same left-to-right concatenation as before, one piece per line.
         message = ("Found non-left-aligned variant in sparse_split_multi\n"
                    + "old locus: "
                    + hl.str(ds.locus)
                    + "\n"
                    + "old ref  : "
                    + ds.alleles[0]
                    + "\n"
                    + "old alt  : "
                    + ds.alleles[i]
                    + "\n"
                    + "mr locus : "
                    + hl.str(mr.locus)
                    + "\n"
                    + "mr ref   : "
                    + mr.alleles[0]
                    + "\n"
                    + "mr alt   : "
                    + mr.alleles[1])

         builder = hl.case()
         builder = builder.when(ds.locus == mr.locus, split_row)
         builder = builder.when(filter_changed_loci, missing_row)
         return builder.or_error(message)

     min_repped = hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]])
     return hl.bind(from_min_rep, min_repped)
Esempio n. 6
0
 def test_min_rep(self):
     """Smoke test: min-rep the fixture dataset and force evaluation."""
     # FIXME actually test
     dataset = self.get_dataset()
     min_repped = hl.min_rep(dataset)
     min_repped.count()
def create_cnn_rank_file() -> None:
    """
    Build the rank table for the CNN scores and write it to its final path.

    The raw score TSV is imported into a Hail Table once (skipped when the
    table's ``_SUCCESS`` marker already exists). Each comma-separated
    alternate allele becomes its own row, alleles are min-repped, gnomAD
    annotations are joined on, rows with ``n_nonref`` of zero are dropped
    and the ranks are computed with ``add_rank``.

    :return: Nothing
    :rtype: None
    """
    scores_tsv_path = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.tsv.bgz'
    scores_ht_path = 'gs://gnomad/variant_qc/temp/friedman_cnn_scores.ht'

    logger.info("Creating CNN rank file.")

    already_imported = hl.utils.hadoop_exists(f'{scores_ht_path}/_SUCCESS')
    if not already_imported:
        logger.info("Importing CNN scores")
        raw_scores = hl.import_table(scores_tsv_path,
                                     min_partitions=1000,
                                     impute=True)
        raw_scores.write(scores_ht_path, overwrite=True)

    logger.info('Formatting CNN scores...')
    ht = hl.read_table(scores_ht_path)

    # Splitting Alt on ',' yields a list; explode gives one row per allele.
    ht = ht.annotate(alt_alleles=ht.Alt.split(','),
                     was_split=ht.Alt.contains(','))
    ht = ht.explode('alt_alleles')
    ht = ht.annotate(locus=hl.locus(hl.str(ht.Contig), ht.Pos))

    # Minrep
    min_repped = hl.min_rep(ht.locus, [ht.Ref, ht.alt_alleles])
    ht = ht.annotate(**min_repped)
    ht = ht.annotate(vartype=add_variant_type(ht.alleles))
    ht = ht.key_by('locus', 'alleles')

    # Select relevant annotations and add singleton / n_nonref, was_split
    gnomad_ht = get_gnomad_annotations('genomes')
    ht = ht.select(
        **gnomad_ht[ht.key],
        variant_type=ht.vartype.variant_type,
        n_alt_alleles=ht.vartype.n_alt_alleles,
        score=ht.CNN_1D_Score,
    )
    ht = ht.filter(ht.n_nonref > 0)

    # Boolean masks shared by several of the sub-rankings below.
    is_biallelic = ~ht.was_split
    is_singleton = ht.singleton
    is_adj = ht.ac > 0
    subrank_expr = {
        'singleton_rank': is_singleton,
        'biallelic_rank': is_biallelic,
        'biallelic_singleton_rank': is_biallelic & is_singleton,
        'adj_rank': is_adj,
        'adj_biallelic_rank': is_biallelic & is_adj,
        'adj_singleton_rank': is_singleton & is_adj,
        'adj_biallelic_singleton_rank': is_biallelic & is_singleton & is_adj,
    }
    ht = add_rank(ht, score_expr=-1 * ht.score, subrank_expr=subrank_expr)

    ht.write(score_ranking_path('genomes', 'cnn'), overwrite=True)
Esempio n. 8
0
import hail as hl

# Initialise Hail on an existing SparkContext. `sc` is not defined in this
# file; presumably it is supplied by the surrounding Spark session -- TODO
# confirm.
hl.init(sc=sc)

# Note: all files were moved to HDFS on spark-c002 to speed along imports (now at 18 hours on 100 cores)

# Input and output locations used by the steps below.
meta_path = 'gnomad_meta/metadata_import_table_intermediate_oct_09_2017.txt'
vcfs_path = 'gnomad_exomes/scattered/*/genotypes.unfiltered.vcf.gz'
out_vds_path = 'gnomad.vds'
vqsr_file = 'gnomad_meta/ExAC.merged.sites_only.vcf.ICfiltered.recalibrated.vcf.bgz'

# Sample metadata table (column types imputed), keyed by the `sample` column.
meta_kt = hl.import_table(meta_path, impute=True).key_by('sample')
# VCF whose `info` row field is joined onto the genotype data below.
vqsr_vds = hl.import_vcf(vqsr_file)

# Import every VCF matching the glob. The .gz files are read as bgzipped,
# GT and PGT are parsed as call fields, and the header comes from the
# separate header file instead of the individual VCFs.
vds = hl.import_vcf(vcfs_path,
                    call_fields=['GT', 'PGT'],
                    force_bgz=True,
                    header_file='gnomad_exomes/gnomad_exomes_header_fixed.vcf')

# Keep only the columns whose metadata row has a true `cloudable` value.
vds = vds.filter_cols(meta_kt[vds.s].cloudable)
# Set the `info` row field from the VQSR data, looked up by (locus, alleles).
vds = vds.annotate_rows(info=vqsr_vds[(vds.locus, vds.alleles), :].info)
# NOTE(review): this calls min_rep on a whole dataset with `left_aligned`;
# that appears to be an older Hail signature -- confirm against the
# installed version.
vds = hl.min_rep(vds, left_aligned=True)
vds.write(out_vds_path)

# To copy to cloud:
# cd /humgen/atgu1/fs03/konradk/gnomad/2.1/hail
# hadoop distcp -libjars gcs-connector-latest-hadoop2.jar -conf core-site.xml -m 50 hdfs:///user/konradk/gnomad.vds gs://gnomad/raw/hail-0.2/vds/exomes/gnomad.exomes.vds
Esempio n. 9
0
 def compute_variant_id(alt):
     """Return the variant id for the reference allele paired with ``alt``.

     The pair ``[alleles[0], alt]`` is min-repped at ``locus`` and the
     resulting locus and alleles are passed to ``variant_id`` together
     with ``max_length``.
     """
     def id_from_min_rep(reduced):
         return variant_id(reduced.locus, reduced.alleles, max_length)

     reduced_variant = hl.min_rep(locus, [alleles[0], alt])
     return hl.rbind(reduced_variant, id_from_min_rep)