def step0_init_and_run_vep(hc, vds, args):
    """Pipeline step 0: read the input dataset, validate it, remap/subset
    samples, optionally run VEP, and write the step-0 VDS/VCF outputs.

    Returns the hail context and the (annotated) VDS so later steps can
    continue from them.
    """
    # A later --start-with-step means step 0 has nothing to do.
    if args.start_with_step > 0:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 0 - run vep ==============================="
    )

    input_path = args.input_dataset.rstrip("/")
    vds = read_in_dataset(
        hc,
        input_path=input_path,
        dataset_type=args.dataset_type,
        filter_interval=args.filter_interval,
        skip_summary=False,
        num_partitions=args.cpu_limit,
        not_gatk_genotypes=args.not_gatk_genotypes,
    )

    validate_dataset(hc, vds, args)

    # Sample-level fix-ups come before any variant-level annotation.
    vds = remap_samples(hc, vds, args)
    vds = subset_samples(hc, vds, args)
    vds = add_global_metadata(vds, args)

    if not args.skip_vep:
        vds = run_vep(vds,
                      genome_version=args.genome_version,
                      block_size=args.vep_block_size)
        # gencode v19 pairs with GRCh37; any other build gets v25.
        gencode_version = "19" if args.genome_version == "37" else "25"
        vds = vds.annotate_global_expr(
            'global.gencodeVersion = "{}"'.format(gencode_version))

    # Don't write when the output path would clobber the input itself.
    if args.step0_output_vds != input_path and not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step0_output_vds)

    if args.export_vcf:
        logger.info("Writing out to VCF...")
        vds.export_vcf(args.step0_output_vcf, overwrite=True)

    # step 0 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 1

    return hc, vds
Ejemplo n.º 2
0
    "Elasticsearch index name. If specified, only this index will be updated.")
# NOTE(review): this span appears to be the tail of an argparse setup plus the
# main body of a stand-alone "update elasticsearch index" script; the
# ArgumentParser construction and several earlier add_argument calls are above
# this view.
p.add_argument(
    "--dataset-path",
    help="(optional) Path of variant callset. If not specified, the original "
    "vcf/vds path from which the data was loaded will be used.")
p.add_argument("--genome-version",
               help="Genome build: 37 or 38",
               choices=["37", "38"])

p.add_argument(
    "--all",
    help="Update all elasticsearch indices. This option is mutually-exclusive "
    "with --index-name, --dataset-path, and --genome-version.",
    action="store_true")

args = p.parse_args()

hc = create_hail_context()

# Optionally refresh the cached clinvar VDS for both genome builds before
# updating any indices.
if args.download_latest_clinvar_vcf:
    for genome_version in ["37", "38"]:
        vds = download_and_import_latest_clinvar_vcf(hc, genome_version)
        write_vds(vds, CLINVAR_VDS_PATH.format(genome_version=genome_version))

# Dispatch: update one named index, or all of them.
if args.index_name and not args.all:
    update_dataset(hc, args.index_name, args)
elif args.all:
    update_all_datasets(hc, args)
else:
    p.exit("ERROR: must specify either --index-name or --all")
# NOTE(review): everything below still runs after update_dataset /
# update_all_datasets return (p.exit() above raises SystemExit, so only the
# else-branch stops here). It reads args.dataset_path / args.chrom which are
# not guaranteed to be set — this looks like a fragment of a *different*
# script concatenated here; confirm against the original source.
if args.dataset_path.endswith(".vds"):
    vds = hc.read(args.dataset_path)
else:
    vds = hc.import_vcf(args.dataset_path,
                        force_bgz=True,
                        min_partitions=10000)

# Restrict to one interval: the whole requested chromosome, or a small
# build-specific default region on chrX.
if args.chrom:
    interval = hail.Interval.parse('%s:1-500000000' % str(args.chrom))
else:
    if args.genome_version == "37":
        interval = hail.Interval.parse('X:31224000-31228000')
    elif args.genome_version == "38":
        interval = hail.Interval.parse('X:31205883-31209883')
    else:
        p.error("Unexpected genome version: " + str(args.genome_version))

vds = vds.filter_intervals(interval)

print("\n==> split_multi")
# Record the original alt alleles before split_multi rewrites them.
vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" %
                                 get_expr_for_orig_alt_alleles_set())
vds = vds.split_multi()
print("")
pprint(vds.variant_schema)

print("\n==> summary: %s" % str(vds.summarize()))

# NOTE(review): output_path is not defined anywhere in this view — confirm it
# is set earlier in the original script.
write_vds(vds, output_path)
def step3_add_reference_datasets(hc, vds, args):
    """Pipeline step 3: annotate the VDS with reference datasets (dbnsfp, cadd,
    population frequency sets, clinvar, hgmd, etc.).

    For VARIANTS datasets, when no individual reference dataset is excluded, a
    single pre-combined reference VDS is used; otherwise each dataset is added
    one-by-one, honoring the per-dataset --exclude-* flags. clinvar and hgmd
    are added for all dataset types.

    Returns the (possibly recreated) hail context and the annotated VDS.
    """
    if args.start_with_step > 3 or args.stop_after_step < 3:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 3 - add reference datasets ==============================="
    )

    # Re-read the step-1 output when no in-memory VDS was carried over, or when
    # intermediate VDSes are being written; a fresh hail context is created in
    # either case.
    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc,
                              args.step1_output_vds,
                              dataset_type=args.dataset_type,
                              skip_summary=True)

    if not args.only_export_to_elasticsearch_at_the_end:
        vds = compute_minimal_schema(vds, args.dataset_type)

    if args.dataset_type == "VARIANTS":
        # One row per variant-level reference dataset, replacing eleven
        # copy-pasted if-blocks:
        #   (log label, excluded?, honors --skip-annotations?, add function,
        #    extra keyword args)
        variant_annotation_steps = [
            ("dbnsfp", args.exclude_dbnsfp, True, add_dbnsfp_to_vds,
             {"root": "va.dbnsfp"}),
            ("cadd", args.exclude_cadd, True, add_cadd_to_vds,
             {"root": "va.cadd"}),
            ("1kg", args.exclude_1kg, True, add_1kg_phase3_to_vds,
             {"root": "va.g1k"}),
            ("exac", args.exclude_exac, True, add_exac_to_vds,
             {"root": "va.exac"}),
            ("topmed", args.exclude_topmed, True, add_topmed_to_vds,
             {"root": "va.topmed"}),
            ("mpc", args.exclude_mpc, True, add_mpc_to_vds,
             {"root": "va.mpc"}),
            ("gnomad exomes", args.exclude_gnomad, True, add_gnomad_to_vds,
             {"exomes_or_genomes": "exomes", "root": "va.gnomad_exomes"}),
            ("gnomad genomes", args.exclude_gnomad, True, add_gnomad_to_vds,
             {"exomes_or_genomes": "genomes", "root": "va.gnomad_genomes"}),
            ("eigen", args.exclude_eigen, True, add_eigen_to_vds,
             {"root": "va.eigen"}),
            # NOTE: primate_ai and splice_ai historically ignore
            # --skip-annotations (only their --exclude-* flags apply).
            ("primate_ai", args.exclude_primate_ai, False, add_primate_ai_to_vds,
             {"root": "va.primate_ai"}),
            ("splice_ai", args.exclude_splice_ai, False, add_splice_ai_to_vds,
             {"root": "va.splice_ai"}),
        ]

        if not any(excluded for _, excluded, _, _, _ in variant_annotation_steps):
            # Nothing excluded: annotate with the combined reference data file,
            # which was generated using
            # ../download_and_create_reference_datasets/v01/hail_scripts/combine_all_variant_level_reference_data.py
            # and contains all these annotations in one .vds
            logger.info("\n==> add combined variant-level reference data")
            vds = add_combined_reference_data_to_vds(
                hc, vds, args.genome_version, subset=args.filter_interval)
        else:
            # Otherwise annotate with each reference data file - one-by-one.
            for label, excluded, honors_skip_flag, add_fn, extra_kwargs in variant_annotation_steps:
                if excluded or (honors_skip_flag and args.skip_annotations):
                    continue
                logger.info("\n==> add %s" % label)
                vds = add_fn(hc,
                             vds,
                             args.genome_version,
                             subset=args.filter_interval,
                             **extra_kwargs)

    # clinvar and hgmd are added regardless of dataset type.
    if not args.skip_annotations and not args.exclude_clinvar:
        logger.info("\n==> add clinvar")
        vds = add_clinvar_to_vds(hc,
                                 vds,
                                 args.genome_version,
                                 root="va.clinvar",
                                 subset=args.filter_interval)

    if not args.skip_annotations and not args.exclude_hgmd:
        logger.info("\n==> add hgmd")
        vds = add_hgmd_to_vds(hc,
                              vds,
                              args.genome_version,
                              root="va.hgmd",
                              subset=args.filter_interval)

    if not args.is_running_locally and not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step3_output_vds)

    # step 3 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 4

    return hc, vds
def step1_compute_derived_fields(hc, vds, args):
    """Pipeline step 1: compute derived variant-level fields (doc/variant ids,
    positions, VEP transcript-consequence summaries, etc.), project the useful
    fields into a clean schema under va.clean, then promote that to be the new
    top-level va.

    Returns the (possibly recreated) hail context and the annotated VDS.
    """
    if args.start_with_step > 1 or args.stop_after_step < 1:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 1 - compute derived fields ==============================="
    )

    # Re-read the step-0 output when no in-memory VDS was carried over, or when
    # intermediate VDSes are being written; a fresh hail context is created in
    # either case.
    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc,
                              args.step0_output_vds,
                              dataset_type=args.dataset_type,
                              skip_summary=True,
                              num_partitions=args.cpu_limit)

    # These expressions are independent of each other, so they can all be
    # applied in a single annotate_variants_expr pass.
    parallel_computed_annotation_exprs = [
        "va.docId = %s" % get_expr_for_variant_id(512),
        "va.variantId = %s" % get_expr_for_variant_id(),
        "va.variantType= %s" % get_expr_for_variant_type(),
        "va.contig = %s" % get_expr_for_contig(),
        "va.pos = %s" % get_expr_for_start_pos(),
        "va.start = %s" % get_expr_for_start_pos(),
        "va.end = %s" % get_expr_for_end_pos(),
        "va.ref = %s" % get_expr_for_ref_allele(),
        "va.alt = %s" % get_expr_for_alt_allele(),
        "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
        "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
        "va.sortedTranscriptConsequences = %s" %
        get_expr_for_vep_sorted_transcript_consequences_array(
            vep_root="va.vep",
            include_coding_annotations=True,
            add_transcript_rank=bool(args.use_nested_objects_for_vep)),
    ]

    if args.dataset_type == "VARIANTS":
        FAF_CONFIDENCE_INTERVAL = 0.95  # based on https://macarthurlab.slack.com/archives/C027LHMPP/p1528132141000430

        parallel_computed_annotation_exprs += [
            "va.FAF = %s" % get_expr_for_filtering_allele_frequency(
                "va.info.AC[va.aIndex - 1]", "va.info.AN",
                FAF_CONFIDENCE_INTERVAL),
        ]

    # These depend on fields computed above (mostly
    # va.sortedTranscriptConsequences), so each must run in its own
    # annotate_variants_expr pass, in order.
    serial_computed_annotation_exprs = [
        "va.xstop = %s" %
        get_expr_for_xpos(field_prefix="va.", pos_field="end"),
        "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.domains = %s" % get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.transcriptConsequenceTerms = %s" %
        get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.mainTranscript = %s" %
        get_expr_for_worst_transcript_consequence_annotations_struct(
            "va.sortedTranscriptConsequences"),
        "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences",
            only_coding_genes=True),
    ]

    # When nested VEP objects are not used, serialize the consequences array
    # to a JSON string for export.
    if not bool(args.use_nested_objects_for_vep):
        serial_computed_annotation_exprs += [
            "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)"
        ]

    vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs)

    for expr in serial_computed_annotation_exprs:
        vds = vds.annotate_variants_expr(expr)

    pprint(vds.variant_schema)

    INPUT_SCHEMA = {}
    if args.dataset_type == "VARIANTS":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,
            originalAltAlleles: Set[String],

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,

            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # NOTE: the former if/else on args.not_gatk_genotypes assigned two
        # byte-identical info_fields schemas, so the dead branch was collapsed
        # into this single assignment.
        INPUT_SCHEMA["info_fields"] = """
            AC: Array[Int],
            AF: Array[Double],
            AN: Int,
            --- BaseQRankSum: Double,
            --- ClippingRankSum: Double,
            --- DP: Int,
            --- FS: Double,
            --- InbreedingCoeff: Double,
            --- MQ: Double,
            --- MQRankSum: Double,
            --- QD: Double,
            --- ReadPosRankSum: Double,
            --- VQSLOD: Double,
            --- culprit: String,
        """
    elif args.dataset_type == "SV":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,

            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # END=100371979;SVTYPE=DEL;SVLEN=-70;CIGAR=1M70D	GT:FT:GQ:PL:PR:SR
        INPUT_SCHEMA["info_fields"] = """
            IMPRECISE: Boolean,
            SVTYPE: String,
            SVLEN: Int,
            END: Int,
            --- OCC: Int,
            --- FRQ: Double,
        """
    else:
        raise ValueError("Unexpected dataset_type: %s" % args.dataset_type)

    if args.exclude_vcf_info_field:
        INPUT_SCHEMA["info_fields"] = ""

    expr = convert_vds_schema_string_to_annotate_variants_expr(root="va.clean",
                                                               **INPUT_SCHEMA)

    vds = vds.annotate_variants_expr(expr=expr)
    vds = vds.annotate_variants_expr("va = va.clean")

    if not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step1_output_vds)

    # step 1 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 2

    return hc, vds
            .format(**locals()) for subpop in subpoulations
        ])

        # NOTE(review): this suite is the tail of a function whose definition
        # — including the loop over `label`, and locals/globals such as
        # `subpopulation_exprs`, USEFUL_TOP_LEVEL_FIELDS, USEFUL_INFO_FIELDS
        # and GNOMAD_SEQR_VDS_PATHS — is above this view; comments here are
        # best-effort.
        # AF_POPMAX_OR_GLOBAL = max of the global AF and the subpopulation AFs.
        vds = vds.annotate_variants_expr(
            "va.info.AF_POPMAX_OR_GLOBAL = [ va.info.AF[va.aIndex-1], {subpopulation_exprs} ].max()"
            .format(**locals()))

        # Copy the useful top-level and info fields into a fresh va.clean
        # struct...
        top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root="va.clean",
            other_source_fields=USEFUL_TOP_LEVEL_FIELDS,
            other_source_root="va",
        )
        info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root="va.clean.info",
            other_source_fields=USEFUL_INFO_FIELDS,
            other_source_root="va.info",
        )

        # Either expression may be empty when its field list is empty.
        expr = []
        if top_fields_expr:
            expr.append(top_fields_expr)
        if info_fields_expr:
            expr.append(info_fields_expr)

        vds = vds.annotate_variants_expr(expr=expr)
        # ...then promote va.clean to be the new top-level va.
        vds = vds.annotate_variants_expr("va = va.clean")

        pprint(vds.variant_schema)

        write_vds(vds, GNOMAD_SEQR_VDS_PATHS[label])
Ejemplo n.º 7
0
    else:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    # NOTE(review): tail of a larger function — `dbnsfp_version`,
    # DBNSFP_FIELDS, `hail_context` and the genome_version validation are
    # above this view.
    # Import the dbNSFP table, drop/rename columns per the version config,
    # remove degenerate ref==alt rows, and key the table by
    # Variant(chr, pos, ref, alt).
    kt = (
        hail_context
            .import_table(
                DBNSFP_FIELDS[dbnsfp_version]["source_path"],
                types=DBNSFP_FIELDS[dbnsfp_version]["field_types"],
                missing='.',
                min_partitions=10000)
            .drop(DBNSFP_FIELDS[dbnsfp_version]["fields_to_drop"])
            .rename(DBNSFP_FIELDS[dbnsfp_version]["rename_fields"])
            .filter("ref==alt", keep=False)
            .annotate("variant=Variant(chr, pos, ref, alt)")
            .key_by('variant')
            .drop(["chr", "pos", "ref", "alt"])

    )

    # create sites-only VDS
    dbnsfp_vds = VariantDataset.from_table(kt)

    output_path = DBNSFP_FIELDS[dbnsfp_version]["output_path"]

    # Record provenance (source file and dbNSFP version) in the VDS globals.
    dbnsfp_vds = dbnsfp_vds.annotate_global_expr('global.sourceFilePath = "{}"'.format(DBNSFP_FIELDS[dbnsfp_version]["source_path"]))
    dbnsfp_vds = dbnsfp_vds.annotate_global_expr('global.version = "{}"'.format(dbnsfp_version))

    write_vds(dbnsfp_vds, output_path)

    pprint(dbnsfp_vds.variant_schema)
# NOTE(review): stand-alone script fragment — `vds`, `args`, `p`, `logger`,
# `hail` and helpers like run_vep/write_vds are defined above this view.
# Ensure enough partitions for the (expensive) VEP run below.
if vds.num_partitions() < 50:
    print("Repartitioning")
    vds = vds.repartition(10000)

vds = vds.annotate_variants_expr(
    "va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set()
)  # save alt alleles before calling split_multi
vds = vds.split_multi()

#vds = vds.filter_alleles('v.altAlleles[aIndex-1].isStar()', keep=False)
# Default to the whole genome (contigs 1 through MT) unless --subset is given.
filter_interval = "1-MT"
if args.subset:
    filter_interval = args.subset

logger.info("\n==> set filter interval to: %s" % (filter_interval, ))
vds = vds.filter_intervals(hail.Interval.parse(filter_interval))

summary = vds.summarize()
pprint.pprint(summary)
# An empty VDS here usually means a contig-name mismatch ("chr1" vs "1").
if summary.variants == 0:
    p.error(
        "0 variants in VDS. Make sure chromosome names don't contain 'chr'")

vds = run_vep(vds,
              genome_version=args.genome_version,
              block_size=args.block_size)

write_vds(vds, args.output_vds)

pprint.pprint(vds.variant_schema)