Example #1
def main():
    all_datasets = pipeline_config.get("datasets", "datasets").split(",")
    parser = argparse.ArgumentParser()
    parser.add_argument("datasets",
                        nargs="*",
                        metavar=f"{{{','.join(all_datasets)}}}")
    args = parser.parse_args()

    if args.datasets:
        for dataset in args.datasets:
            if dataset not in all_datasets:
                print(
                    f"error: invalid dataset '{dataset}' (choose from {', '.join(all_datasets)})",
                    file=sys.stderr)
                return 1

        datasets_to_combine = args.datasets
    else:
        datasets_to_combine = all_datasets

    hl.init()

    output_path = os.path.join(pipeline_config.get("output", "staging_path"),
                               "combined.ht")
    combine_datasets(datasets_to_combine).write(output_path, overwrite=True)
def prepare_gene_models_helper(reference_genome):
    gencode_path = pipeline_config.get("reference_data", f"{reference_genome.lower()}_gencode_path")
    canonical_transcripts_path = pipeline_config.get(
        "reference_data", f"{reference_genome.lower()}_canonical_transcripts_path"
    )

    # Load genes from GTF file
    genes = load_gencode_gene_models(gencode_path, reference_genome)
    genes = genes.distinct()
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(canonical_transcripts_path)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.annotate(
        canonical_transcript=genes.canonical_transcript.annotate(
            exons=hl.cond(
                genes.canonical_transcript.exons.any(lambda exon: exon.feature_type == "CDS"),
                genes.canonical_transcript.exons.filter(lambda exon: exon.feature_type == "CDS"),
                genes.canonical_transcript.exons.filter(lambda exon: exon.feature_type == "exon"),
            )
        )
    )
    genes = genes.drop("transcripts")

    return genes
Example #3
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("ASC", "gene_results_path"),
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    ds = ds.group_by("gene_id").aggregate(
        group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(group_results=hl.dict(
        ds.group_results.map(lambda group_result:
                             (group_result.analysis_group,
                              group_result.drop("analysis_group")))))

    return ds
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print cluster creation command without running it")
    args = parser.parse_args()

    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    command = [
        "hailctl",
        "dataproc",
        "start",
        "exome-results",
        "--max-idle=1h",
    ]

    for option in ["project", "region", "zone", "service-account"]:
        value = pipeline_config.get("dataproc", option, fallback=None)
        if value:
            command.append(f"--{option}={value}")

    print(" ".join(command[:4]) + " \\\n    " + " \\\n    ".join(command[4:]))
    if not args.dry_run:
        subprocess.check_call(command)
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA",
                                           "variant_annotations_path")

    results = hl.read_table(results_path)

    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))

    results = results.annotate(
        source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by(
        "locus",
        "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = hl.read_table(annotations_path)
    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case().when(
            (variants.canonical_term == "missense_variant") &
            (variants.mpc >= 3), "missense_variant_mpc_>=3").when(
                (variants.canonical_term == "missense_variant") &
                (variants.mpc >= 2), "missense_variant_mpc_2-3").when(
                    variants.canonical_term == "missense_variant",
                    "missense_variant_mpc_<2").default(
                        variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd,
                       mpc=variants.mpc,
                       polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
def prepare_variant_results():
    results = hl.read_table(
        pipeline_config.get("BipEx", "variant_results_path"))

    # Get unique variants from results table
    variants = results.group_by(results.locus, results.alleles).aggregate()

    # Select AC/AF numbers for the alternate allele
    results = results.annotate(ac_case=results.ac_case[1],
                               ac_ctrl=results.ac_ctrl[1])

    results = results.drop("af_case", "af_ctrl")

    results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0))

    # Annotate variants with a struct for each analysis group
    results = results.group_by(
        "locus",
        "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = variants.annotate(**results[variants.locus, variants.alleles])

    # Merge variant annotations for canonical transcripts
    annotations = hl.read_table(
        pipeline_config.get("BipEx", "variant_annotations_path"))
    annotations = annotations.filter(
        annotations.transcript_id == annotations.canonical_transcript_id)

    annotations = annotations.select(
        "gene_id",
        consequence=annotations.csq_analysis,
        hgvsc=annotations.hgvsc_canonical.split(":")[-1],
        hgvsp=annotations.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=annotations.cadd,
                       mpc=annotations.mpc,
                       polyphen=annotations.polyphen),
    )

    variants = variants.annotate(**annotations[variants.locus,
                                               variants.alleles])

    return variants
def prepare_gene_models():
    genes_grch37 = prepare_gene_models_helper("GRCh37")
    genes_grch38 = prepare_gene_models_helper("GRCh38")

    genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value)
    genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value)

    genes = genes_grch37.join(genes_grch38, how="outer")

    # Annotate genes with information from HGNC
    hgnc_path = pipeline_config.get("reference_data", "hgnc_path")
    hgnc = load_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol)),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr)))
            .append(genes.GRCh38.gencode_gene_symbol)
            .append(genes.GRCh37.gencode_gene_symbol)
            .filter(hl.is_defined)
            .map(lambda s: s.upper())
        ),
    )

    gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path")
    gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path)
    genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id])

    exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path")
    exac_constraint = prepare_exac_constraint(exac_constraint_path)
    genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id])

    staging_path = pipeline_config.get("output", "staging_path")

    genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
Example #8
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("Epi25", "gene_results_path"),
        delimiter=",",
        missing="NA",
        quote='"',
        types={
            "gene_id": hl.tstr,
            "gene_name": hl.tstr,
            "description": hl.tstr,
            "pval_meta": hl.tfloat,
            "analysis_group": hl.tstr,
            # LoF
            "xcase_lof": hl.tint,
            "xctrl_lof": hl.tint,
            "pval_lof": hl.tfloat,
            # MPC
            "xcase_mpc": hl.tint,
            "xctrl_mpc": hl.tint,
            "pval_mpc": hl.tfloat,
            # Inframe indel
            "xcase_infrIndel": hl.tint,
            "xctrl_infrIndel": hl.tint,
            "pval_infrIndel": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    # Rename EE group to DEE
    ds = ds.annotate(analysis_group=hl.if_else(ds.analysis_group == "EE", "DEE", ds.analysis_group))

    # "Meta" p-val was carried over from SCHEMA's data format but isn't descriptive of Epi25
    ds = ds.rename({"pval_meta": "pval"})

    ds = ds.group_by("gene_id").aggregate(group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(
        group_results=hl.dict(
            ds.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("gene_id", "analysis_group"))
            )
        )
    )

    return ds
def main():
    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    command = [
        "hailctl",
        "dataproc",
        "stop",
        "exome-results",
    ]

    for option in ["project", "region"]:
        value = pipeline_config.get("dataproc", option, fallback=None)
        if value:
            command.append(f"--{option}={value}")

    print(" ".join(command[:4]) + " \\\n    " + " \\\n    ".join(command[4:]))
    subprocess.check_call(command)
def main():
    all_datasets = pipeline_config.get("datasets", "datasets").split(",")
    parser = argparse.ArgumentParser()
    parser.add_argument("datasets",
                        nargs="*",
                        metavar=f"{{{','.join(all_datasets)}}}")
    args = parser.parse_args()

    if args.datasets:
        for dataset in args.datasets:
            if dataset not in all_datasets:
                print(
                    f"error: invalid dataset '{dataset}' (choose from {', '.join(all_datasets)})",
                    file=sys.stderr)
                return 1

        datasets_to_prepare = args.datasets
    else:
        datasets_to_prepare = all_datasets

    hl.init()

    for dataset in datasets_to_prepare:
        prepare_downloads_for_dataset(dataset)
Example #11
def prepare_gene_results():
    results = hl.read_table(pipeline_config.get("BipEx", "gene_results_path"))

    results = results.select_globals()

    # Select result fields, discard gene information
    results = results.select(
        "gene_id",
        "analysis_group",
        "case_count",
        "control_count",
        "n_cases",
        "n_controls",
        "fisher_pval",
        "fisher_OR",
        "fisher_gnom_non_psych_pval",
        "fisher_gnom_non_psych_OR",
        "CMH_pval",
        "CMH_OR",
        "CMH_gnom_non_psych_pval",
        "CMH_gnom_non_psych_OR",
    )

    # Drop result fields not shown in browser
    results = results.drop("fisher_pval", "fisher_OR", "CMH_pval", "CMH_OR")

    results = results.annotate(
        # fisher_OR=hl.float(results.fisher_OR),
        fisher_gnom_non_psych_OR=hl.float(results.fisher_gnom_non_psych_OR),
        # CMH_OR=hl.float(results.CMH_OR),
        CMH_gnom_non_psych_OR=hl.float(results.CMH_gnom_non_psych_OR),
    )

    final_results = None

    consequence_categories = results.aggregate(
        hl.agg.collect_as_set(results.consequence_category))
    per_category_fields = [
        "case_count",
        "control_count",
        # "fisher_pval",
        # "fisher_OR",
        "fisher_gnom_non_psych_pval",
        "fisher_gnom_non_psych_OR",
        # "CMH_pval",
        # "CMH_OR",
        "CMH_gnom_non_psych_pval",
        "CMH_gnom_non_psych_OR",
    ]
    for category in consequence_categories:
        category_results = results.filter(
            results.consequence_category == category)
        category_results = category_results.key_by("gene_id", "analysis_group")
        category_results = category_results.select(
            n_cases=category_results.n_cases,
            n_controls=category_results.n_controls,
            **{
                f"{category}_{field}": category_results[field]
                for field in per_category_fields
            },
        )

        if final_results:
            final_results = final_results.join(
                category_results.drop("n_cases", "n_controls"),
                "outer",
            )

            # N cases/controls should be the same for all consequence categories for a gene/analysis group.
            # However, if there are no variants of a certain consequence category found in a gene, then
            # N cases/controls for that gene/analysis group/consequence category will be missing.
            final_results = final_results.annotate(
                n_cases=hl.or_else(
                    final_results.n_cases,
                    category_results[final_results.gene_id,
                                     final_results.analysis_group].n_cases),
                n_controls=hl.or_else(
                    final_results.n_controls,
                    category_results[final_results.gene_id,
                                     final_results.analysis_group].n_controls,
                ),
            )
        else:
            final_results = category_results

    final_results = final_results.group_by("gene_id").aggregate(
        group_results=hl.agg.collect(final_results.row.drop("gene_id")))
    final_results = final_results.annotate(group_results=hl.dict(
        final_results.group_results.map(
            lambda group_result: (group_result.analysis_group,
                                  group_result.drop("analysis_group")))))

    return final_results
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("SCHEMA", "gene_results_path"),
        delimiter="\t",
        missing="NA",
        types={
            "Gene ID": hl.tstr,
            "Gene Symbol": hl.tstr,
            "Gene Name": hl.tstr,
            "Case PTV": hl.tint,
            "Ctrl PTV": hl.tint,
            "Case mis3": hl.tint,
            "Ctrl mis3": hl.tint,
            "Case mis2": hl.tint,
            "Ctrl mis2": hl.tint,
            "P ca/co (Class 1)": hl.tfloat,
            "P ca/co (Class 2)": hl.tfloat,
            "P ca/co (comb)": hl.tfloat,
            "De novo PTV": hl.tint,
            "De novo mis3": hl.tint,
            "De novo mis2": hl.tint,
            "P de novo": hl.tfloat,
            "P meta": hl.tfloat,
            "Q meta": hl.tfloat,
            "OR (PTV)": hl.tstr,
            "OR (Class I)": hl.tstr,
            "OR (Class II)": hl.tstr,
        },
    )

    # Parse upper and lower bounds out of odds ratio columns
    def _parse_odds_ratio(field_name):
        return hl.rbind(
            ds[field_name].split(" ", n=2),
            lambda parts: hl.rbind(
                parts[0],
                parts[1][1:-1].split("-", 2),
                lambda value, bounds: hl.struct(
                    **{
                        field_name: hl.float(value),
                        field_name + " lower bound": hl.float(bounds[0]),
                        field_name + " upper bound": hl.float(bounds[1]),
                    }),
            ),
        )
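    # For illustration (hypothetical value): an input like "1.5 (0.8-2.3)" is split
    # into a point estimate of 1.5 plus lower/upper bound fields of 0.8 and 2.3,
    # as implied by the parsing above.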

    ds = ds.transmute(**_parse_odds_ratio("OR (PTV)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class I)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class II)"))

    ds = ds.drop("Gene Symbol", "Gene Name")

    ds = ds.rename({"Gene ID": "gene_id"})
    ds = ds.key_by("gene_id")

    ds = ds.select(group_results=hl.dict([(
        "meta",
        hl.struct(**{field: ds[field]
                     for field in ds.row_value.dtype.fields}))]))

    return ds
def prepare_downloads_for_dataset(dataset_id):
    output_path = pipeline_config.get("output", "staging_path")

    dataset_prefix = os.path.join(output_path, dataset_id.lower())
    output_prefix = os.path.join(output_path, "downloads", dataset_id)

    gene_results_path = os.path.join(dataset_prefix, "gene_results.ht")
    gene_results = hl.read_table(gene_results_path)
    validate_gene_results_table(gene_results)

    gene_group_result_fields = gene_results.group_results.dtype.value_type.fields
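    # Flatten the nested group_results dict into one TSV row per gene and analysis
    # group, with a "group" column naming the analysis group.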
    gene_results_dsv = gene_results
    gene_results_dsv = gene_results_dsv.transmute(group_results=hl.array(
        gene_results_dsv.group_results
    ).map(lambda group_and_result: group_and_result[1].annotate(
        group=group_and_result[0]).select("group", *gene_group_result_fields)))
    gene_results_dsv = gene_results_dsv.explode(gene_results_dsv.group_results,
                                                name="group_result")
    gene_results_dsv = gene_results_dsv.transmute(
        **gene_results_dsv.group_result)
    gene_results_dsv.export(
        os.path.join(output_prefix, f"{dataset_id}_gene_results.tsv.bgz"))

    variant_results_path = os.path.join(dataset_prefix, "variant_results.ht")
    variant_results = hl.read_table(variant_results_path)
    validate_variant_results_table(variant_results)

    variant_group_result_fields = variant_results.group_results.dtype.value_type.fields
    variant_results_dsv = variant_results
    variant_results_dsv = variant_results_dsv.transmute(
        **variant_results_dsv.info)
    variant_results_dsv = variant_results_dsv.transmute(
        group_results=hl.array(variant_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[
                1].annotate(group=group_and_result[0]).select(
                    "group", *variant_group_result_fields)))
    variant_results_dsv = variant_results_dsv.explode(
        variant_results_dsv.group_results, name="group_result")
    variant_results_dsv = variant_results_dsv.transmute(
        **variant_results_dsv.group_result)
    variant_results_dsv.export(
        os.path.join(output_prefix, f"{dataset_id}_variant_results.tsv.bgz"))

    variant_results_groups = variant_results.aggregate(
        hl.agg.explode(hl.agg.collect_as_set,
                       variant_results.group_results.keys()))

    variant_info_fields = variant_results.info.dtype.fields
    variant_base_fields = set(
        variant_results.row_value) - {"info", "group_results"}

    all_fields = list(variant_base_fields) + list(variant_info_fields) + list(
        variant_group_result_fields)
    assert len(all_fields) == len(set(all_fields)), "Conflicting field names"

    variant_results_vcf = variant_results
    variant_results_vcf = variant_results_vcf.annotate(
        groups=variant_results_vcf.group_results.keys())
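    # Build the VCF INFO struct: top-level and info fields are copied as-is, while
    # each per-group result field becomes an array of values ordered to match the
    # exported "groups" list.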
    variant_results_vcf = variant_results_vcf.select(info=hl.struct(
        **{f: variant_results_vcf[f]
           for f in variant_base_fields},
        **{f: variant_results_vcf.info[f]
           for f in variant_info_fields},
        groups=variant_results_vcf.groups,
        **dict(
            map(
                lambda f: (
                    f,
                    variant_results_vcf.groups.map(
                        lambda group: variant_results_vcf.group_results[group][
                            f]),
                ),
                variant_group_result_fields,
            )),
    ), )

    def _convert_type(field):
        if isinstance(field.dtype, hl.tarray):
            if field.dtype.element_type == hl.tbool:
                return field.map(hl.int)

        return field

    variant_results_vcf = variant_results_vcf.annotate(
        info=variant_results_vcf.info.annotate(**{
            f: _convert_type(variant_results_vcf.info[f])
            for f in all_fields
        }))
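    # _convert_type above turns arrays of booleans into 0/1 integers, since VCF
    # INFO fields cannot hold arrays of flags.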

    with NamedTemporaryFile("w") as header_file:
        header_file.write(
            f"analysis_groups={','.join(variant_results_groups)}")
        # Flush so the header line is on disk before Hail reads the file
        header_file.flush()

        hl.export_vcf(
            variant_results_vcf,
            os.path.join(output_prefix,
                         f"{dataset_id}_variant_results.vcf.bgz"),
            append_to_header=f"file://{header_file.name}",
            metadata={
                "info": {
                    **{
                        f: {
                            "Number": str(len(variant_results_groups))
                        }
                        for f in variant_group_result_fields
                    }
                }
            },
        )
Example #14
def combine_datasets(dataset_ids):
    gene_models_path = f"{pipeline_config.get('output', 'staging_path')}/gene_models.ht"
    ds = hl.read_table(gene_models_path)

    ds = ds.annotate(gene_results=hl.struct(), variants=hl.struct())
    ds = ds.annotate_globals(
        meta=hl.struct(variant_fields=VARIANT_FIELDS, datasets=hl.struct()))

    for dataset_id in dataset_ids:
        dataset_path = os.path.join(
            pipeline_config.get("output", "staging_path"), dataset_id.lower())
        gene_results = hl.read_table(
            os.path.join(dataset_path, "gene_results.ht"))

        gene_group_result_field_names = gene_results.group_results.dtype.value_type.fields
        gene_group_result_field_types = [
            str(typ).rstrip("3264")
            for typ in gene_results.group_results.dtype.value_type.types
        ]
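        # rstrip("3264") trims the bit-width suffix from Hail type names
        # (e.g. "int32" -> "int", "float64" -> "float"), presumably so the stored
        # schema uses generic type names.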
        gene_result_analysis_groups = list(
            gene_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set,
                               gene_results.group_results.keys())))

        gene_results = gene_results.annotate(group_results=hl.array([
            hl.tuple([
                gene_results.group_results.get(group)[field]
                for field in gene_group_result_field_names
            ]) for group in gene_result_analysis_groups
        ]))

        ds = ds.annotate(gene_results=ds.gene_results.annotate(
            **{dataset_id: gene_results[ds.gene_id]}))

        variant_results = hl.read_table(
            os.path.join(dataset_path, "variant_results.ht"))

        reference_genome = variant_results.locus.dtype.reference_genome.name
        variant_info_field_names = variant_results.info.dtype.fields
        variant_info_field_types = [
            str(typ).rstrip("3264") for typ in variant_results.info.dtype.types
        ]
        variant_group_result_field_names = variant_results.group_results.dtype.value_type.fields
        variant_group_result_field_types = [
            str(typ).rstrip("3264")
            for typ in variant_results.group_results.dtype.value_type.types
        ]
        variant_result_analysis_groups = list(
            variant_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set,
                               variant_results.group_results.keys())))

        variant_results = variant_results.annotate(
            info=hl.tuple([
                variant_results.info[field]
                for field in variant_info_field_names
            ]),
            group_results=hl.array([
                hl.rbind(
                    variant_results.group_results.get(group),
                    lambda group_result: hl.or_missing(
                        hl.is_defined(group_result),
                        hl.tuple([
                            group_result[field]
                            for field in variant_group_result_field_names
                        ]),
                    ),
                ) for group in variant_result_analysis_groups
            ]),
        )

        variant_results = variant_results.annotate(
            variant_id=variant_results.locus.contig.replace("^chr", "") + "-" +
            hl.str(variant_results.locus.position) + "-" +
            variant_results.alleles[0] + "-" + variant_results.alleles[1],
            pos=variant_results.locus.position,
        )
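        # variant_id has the form "chrom-pos-ref-alt" with any "chr" prefix
        # removed, e.g. "1-55505475-C-T" (hypothetical value).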

        variant_results = variant_results.annotate(variant=hl.tuple(
            [variant_results[field] for field in VARIANT_FIELDS]))
        variant_results = variant_results.group_by("gene_id").aggregate(
            variants=hl.agg.collect(variant_results.variant))
        ds = ds.annotate(variants=ds.variants.annotate(
            **{
                dataset_id:
                hl.or_else(
                    variant_results[ds.gene_id].variants,
                    hl.empty_array(
                        variant_results.variants.dtype.element_type),
                )
            }))

        ds = ds.annotate_globals(meta=ds.globals.meta.annotate(
            datasets=ds.globals.meta.datasets.annotate(**{
                dataset_id: hl.struct(
                    reference_genome=reference_genome,
                    gene_result_analysis_groups=gene_result_analysis_groups or hl.empty_array(hl.tstr),
                    gene_group_result_field_names=gene_group_result_field_names or hl.empty_array(hl.tstr),
                    gene_group_result_field_types=gene_group_result_field_types or hl.empty_array(hl.tstr),
                    variant_info_field_names=variant_info_field_names or hl.empty_array(hl.tstr),
                    variant_info_field_types=variant_info_field_types or hl.empty_array(hl.tstr),
                    variant_result_analysis_groups=variant_result_analysis_groups or hl.empty_array(hl.tstr),
                    variant_group_result_field_names=variant_group_result_field_names or hl.empty_array(hl.tstr),
                    variant_group_result_field_types=variant_group_result_field_types or hl.empty_array(hl.tstr),
                ),
            })))

    return ds
Example #15
def prepare_variant_results():
    variant_results = hl.import_table(
        pipeline_config.get("Epi25", "variant_results_path"),
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "AC case": hl.tint,
            "AC control": hl.tint,
            "AF case": hl.tfloat,
            "AF control": hl.tfloat,
            "AN case": hl.tint,
            "AN control": hl.tint,
            "Analysis group": hl.tstr,
            "Estimate": hl.tfloat,
            "I2": hl.tfloat,
            "N denovos": hl.tint,
            "P-value": hl.tfloat,
            "Qp": hl.tfloat,
            "SE": hl.tfloat,
        },
    )

    variant_results = variant_results.rename(
        {
            "AC case": "ac_case",
            "AC control": "ac_ctrl",
            "AF case": "af_case",
            "AF control": "af_ctrl",
            "AN case": "an_case",
            "AN control": "an_ctrl",
            "Analysis group": "analysis_group",
        }, )

    # Rename "EE" analysis group to "DEE"
    variant_results = variant_results.annotate(
        analysis_group=hl.cond(variant_results.analysis_group == "EE", "DEE",
                               variant_results.analysis_group))

    variant_results = variant_results.drop("af_case", "af_ctrl")

    variant_results = variant_results.group_by("Variant ID").aggregate(
        group_results=hl.agg.collect(variant_results.row_value))
    variant_results = variant_results.annotate(group_results=hl.dict(
        variant_results.group_results.map(
            lambda group_result: (group_result.analysis_group,
                                  group_result.drop("analysis_group")))))

    variant_annotations = hl.import_table(
        pipeline_config.get("Epi25", "variant_annotations_path"),
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "CADD": hl.tfloat,
            "Comment": hl.tstr,
            "Consequence (canonical)": hl.tstr,
            "Consequence (for analysis)": hl.tstr,
            "Consequence (worst)": hl.tstr,
            "Flags": hl.tstr,
            "Gene ID": hl.tstr,
            "Gene name": hl.tstr,
            "HGVSc (canonical)": hl.tstr,
            "HGVSc": hl.tstr,
            "HGVSp (canonical)": hl.tstr,
            "HGVSp": hl.tstr,
            "In analysis": hl.tbool,
            "MPC": hl.tfloat,
            "Polyphen": hl.tstr,
            "Source": hl.tstr,
            "Transcript ID (canonical)": hl.tstr,
            "Transcript ID(s)": hl.tstr,
        },
    )

    variant_annotations = variant_annotations.rename({
        "CADD": "cadd",
        "Comment": "comment",
        "Consequence (canonical)": "csq_canonical",
        "Consequence (for analysis)": "csq_analysis",
        "Consequence (worst)": "csq_worst",
        "Flags": "flags",
        "Gene ID": "gene_id",
        "Gene name": "gene_name",
        "HGVSc (canonical)": "hgvsc_canonical",
        "HGVSc": "hgvsc",
        "HGVSp (canonical)": "hgvsp_canonical",
        "HGVSp": "hgvsp",
        "In analysis": "in_analysis",
        "MPC": "mpc",
        "Polyphen": "polyphen",
        "Source": "source",
        "Transcript ID (canonical)": "canonical_transcript_id",
        "Transcript ID(s)": "transcript_id",
    })

    variant_annotations = variant_annotations.select(
        "gene_id",
        consequence=variant_annotations.csq_analysis,
        hgvsc=variant_annotations.hgvsc_canonical.split(":")[-1],
        hgvsp=variant_annotations.hgvsp_canonical.split(":")[-1],
        info=hl.struct(
            comment=variant_annotations.comment,
            in_analysis=variant_annotations.in_analysis,
            cadd=variant_annotations.cadd,
            mpc=variant_annotations.mpc,
            polyphen=variant_annotations.polyphen,
        ),
    )

    variants = variant_annotations.annotate(
        group_results=variant_results[variant_annotations.key].group_results)

    variants = variants.annotate(
        locus=hl.rbind(
            variants["Variant ID"].split(":"),
            lambda p: hl.locus(p[0], hl.int(p[1]), reference_genome="GRCh37")),
        alleles=hl.rbind(variants["Variant ID"].split(":"),
                         lambda p: [p[2], p[3]]),
    )
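    # "Variant ID" strings are assumed to have the form "chrom:pos:ref:alt"
    # (e.g. "1:55505475:C:T", hypothetical), parsed here into a GRCh37 locus and alleles.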

    variants = variants.key_by("locus", "alleles")

    return variants
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("pipeline", choices=PIPELINES, help="Pipeline to run")
    parser.add_argument(
        "--environment",
        choices=("local", "dataproc"),
        default="local",
        help="Environment in which to run the pipeline (defaults to %(default)s",
    )
    parser.add_argument("--dry-run", action="store_true", help="Print pipeline command without running it")
    args, other_args = parser.parse_known_args()

    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    start_time = time.time()

    if args.environment == "local":
        command = ["python3", "-m", f"data_pipeline.pipelines.{args.pipeline}"]

        if other_args:
            command.extend(other_args)

        print(" ".join(command[:2]) + " \\\n    " + " \\\n    ".join(command[2:]))
        if not args.dry_run:
            sys.path.insert(1, os.getcwd())
            try:
                subprocess.check_call(
                    command, env={**os.environ, "PYSPARK_SUBMIT_ARGS": "--driver-memory 4g pyspark-shell",},
                )

                elapsed_time = time.time() - start_time
                print(f"Done in {int(elapsed_time // 60)}m{int(elapsed_time % 60)}s")

            except subprocess.CalledProcessError:
                print(f"Error running data_pipeline/pipelines/{args.pipeline}.py")
                sys.exit(1)

    elif args.environment == "dataproc":
        # Zip contents of data_pipeline directory for upload to Dataproc cluster
        with tempfile.NamedTemporaryFile(prefix="pyfiles_", suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file.name, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for root, _, files in os.walk("data_pipeline"):
                    for name in files:
                        if name.endswith(".py"):
                            zip_file.write(
                                os.path.join(root, name), os.path.relpath(os.path.join(root, name)),
                            )

            # `hailctl dataproc submit` does not support project/region/zone arguments,
            # so use `gcloud dataproc jobs submit` instead.
            command = [
                "gcloud",
                "dataproc",
                "jobs",
                "submit",
                "pyspark",
            ]

            for option in ["project", "region"]:
                value = pipeline_config.get("dataproc", option, fallback=None)
                if value:
                    command.append(f"--{option}={value}")

            command.extend(
                [
                    "--cluster=exome-results",
                    f"--py-files={tmp_file.name}",
                    "--files=pipeline_config.ini",
                    f"data_pipeline/pipelines/{args.pipeline}.py",
                ]
            )

            if other_args:
                command.append("--")
                command.extend(other_args)

            print(" ".join(command[:5]) + " \\\n    " + " \\\n    ".join(command[5:]))
            if not args.dry_run:
                subprocess.check_call(command)

                elapsed_time = time.time() - start_time
                print(f"Done in {elapsed_time // 60}m{elapsed_time % 60}s")
def prepare_variant_results():
    annotations = None
    results = None

    for group in ("dn", "dbs", "swe"):
        group_annotations_path = pipeline_config.get("ASC", f"{group}_variant_annotations_path")
        group_results_path = pipeline_config.get("ASC", f"{group}_variant_results_path")

        group_annotations = hl.import_table(
            group_annotations_path,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )

        group_annotations = group_annotations.repartition(100, shuffle=True)

        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

        group_results = hl.import_table(
            group_results_path,
            force=True,
            min_partitions=100,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tint,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tint,
                "af_ctrl": hl.tstr,
            },
        )

        group_results = group_results.repartition(100, shuffle=True)

        group_results = group_results.drop("af_case", "af_ctrl")

        group_results = group_results.annotate(in_analysis=group_annotations[group_results.v].in_analysis)

        if results is None:
            results = group_results
        else:
            results = results.union(group_results)

    annotations = annotations.cache()
    results = results.cache()

    annotations = annotations.distinct()
    annotations = annotations.cache()

    annotations = annotations.select(
        "gene_id",
        consequence=hl.sorted(
            annotations.csq_analysis.split(","),
            lambda c: CONSEQUENCE_TERM_RANKS.get(c),  # pylint: disable=unnecessary-lambda
        )[0],
        hgvsc=annotations.hgvsc.split(":")[-1],
        hgvsp=annotations.hgvsp.split(":")[-1],
        info=hl.struct(mpc=annotations.mpc, polyphen=annotations.polyphen),
    )
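    # csq_analysis can hold several comma-separated consequence terms; sorting by
    # CONSEQUENCE_TERM_RANKS and taking the first keeps the lowest-ranked term
    # (assumed to be the most severe).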

    results = results.group_by("v").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(
        group_results=hl.dict(
            results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    variants = annotations.annotate(group_results=results[annotations.key].group_results)

    variants = variants.annotate(
        locus=hl.rbind(variants.v.split(":"), lambda p: hl.locus(p[0], hl.int(p[1]), reference_genome="GRCh37")),
        alleles=hl.rbind(variants.v.split(":"), lambda p: [p[2], p[3]]),
    )

    variants = variants.key_by("locus", "alleles")

    return variants