def main():
    all_datasets = pipeline_config.get("datasets", "datasets").split(",")

    parser = argparse.ArgumentParser()
    parser.add_argument("datasets", nargs="*", metavar=f"{{{','.join(all_datasets)}}}")
    args = parser.parse_args()

    if args.datasets:
        for dataset in args.datasets:
            if dataset not in all_datasets:
                print(
                    f"error: invalid dataset '{dataset}' (choose from {', '.join(all_datasets)})",
                    file=sys.stderr,
                )
                return 1
        datasets_to_combine = args.datasets
    else:
        datasets_to_combine = all_datasets

    hl.init()

    output_path = os.path.join(pipeline_config.get("output", "staging_path"), "combined.ht")
    combine_datasets(datasets_to_combine).write(output_path, overwrite=True)
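# The `pipeline_config` object used throughout these functions is imported from
# data_pipeline.config. A minimal sketch of what that module might contain,
# assuming it is a thin wrapper around the standard library's configparser (the
# get(section, option, fallback=...) calls match that API exactly); the real
# module may differ:
#
#     # data_pipeline/config.py (hypothetical sketch)
#     import configparser
#
#     pipeline_config = configparser.ConfigParser()
#     # The entry-point scripts chdir to the repository root first, so this
#     # relative path resolves (see the "Set working directory" comments below).
#     pipeline_config.read("pipeline_config.ini")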
def prepare_gene_models_helper(reference_genome):
    gencode_path = pipeline_config.get("reference_data", f"{reference_genome.lower()}_gencode_path")
    canonical_transcripts_path = pipeline_config.get(
        "reference_data", f"{reference_genome.lower()}_canonical_transcripts_path"
    )

    # Load genes from GTF file
    genes = load_gencode_gene_models(gencode_path, reference_genome)
    genes = genes.distinct()
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(canonical_transcripts_path)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )

    # Prefer CDS features for the canonical transcript's exons; fall back to
    # "exon" features for non-coding transcripts
    genes = genes.annotate(
        canonical_transcript=genes.canonical_transcript.annotate(
            exons=hl.cond(
                genes.canonical_transcript.exons.any(lambda exon: exon.feature_type == "CDS"),
                genes.canonical_transcript.exons.filter(lambda exon: exon.feature_type == "CDS"),
                genes.canonical_transcript.exons.filter(lambda exon: exon.feature_type == "exon"),
            )
        )
    )

    genes = genes.drop("transcripts")

    return genes
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("ASC", "gene_results_path"),
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    ds = ds.group_by("gene_id").aggregate(group_results=hl.agg.collect(ds.row_value))

    ds = ds.annotate(
        group_results=hl.dict(
            ds.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    return ds
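# The group_by/collect/hl.dict pattern above recurs throughout these pipelines:
# per-group result rows are collapsed into one row per gene, holding a dict
# keyed by analysis group. A minimal, self-contained sketch of the pattern on a
# toy table (field names and values here are illustrative only, not from any
# dataset):
def _sketch_group_results_pattern():
    ht = hl.Table.parallelize(
        [
            {"gene_id": "ENSG00000001", "analysis_group": "meta", "pval": 0.01},
            {"gene_id": "ENSG00000001", "analysis_group": "DN", "pval": 0.20},
        ],
        hl.tstruct(gene_id=hl.tstr, analysis_group=hl.tstr, pval=hl.tfloat),
    )

    # One row per gene, holding an array of per-group structs...
    ht = ht.group_by("gene_id").aggregate(group_results=hl.agg.collect(ht.row_value))

    # ...reshaped into a dict keyed by analysis group.
    ht = ht.annotate(
        group_results=hl.dict(
            ht.group_results.map(lambda r: (r.analysis_group, r.drop("gene_id", "analysis_group")))
        )
    )

    ht.show()  # group_results: {"meta": (pval=0.01), "DN": (pval=0.20)}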
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run", action="store_true", help="Print cluster creation command without running it"
    )
    args = parser.parse_args()

    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    command = [
        "hailctl",
        "dataproc",
        "start",
        "exome-results",
        "--max-idle=1h",
    ]

    for option in ["project", "region", "zone", "service-account"]:
        value = pipeline_config.get("dataproc", option, fallback=None)
        if value:
            command.append(f"--{option}={value}")

    print(" ".join(command[:4]) + " \\\n " + " \\\n ".join(command[4:]))

    if not args.dry_run:
        subprocess.check_call(command)
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA", "variant_annotations_path")

    results = hl.read_table(results_path)

    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) + hl.or_else(results.n_denovos, 0))

    results = results.annotate(source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by("locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value))

    results = results.annotate(
        group_results=hl.dict(
            results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    variants = hl.read_table(annotations_path)

    # Stratify missense variants into bins by MPC score
    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case()
        .when(
            (variants.canonical_term == "missense_variant") & (variants.mpc >= 3),
            "missense_variant_mpc_>=3",
        )
        .when(
            (variants.canonical_term == "missense_variant") & (variants.mpc >= 2),
            "missense_variant_mpc_2-3",
        )
        .when(variants.canonical_term == "missense_variant", "missense_variant_mpc_<2")
        .default(variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd, mpc=variants.mpc, polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
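# A plain-Python restatement of the MPC stratification above. The thresholds
# come from the hl.case() chain; the handling of a missing MPC score here
# (falling through to the "<2" bin) is an assumption, not necessarily how Hail
# evaluates the comparisons on missing values.
def _sketch_missense_bin(canonical_term, mpc):
    if canonical_term != "missense_variant":
        return canonical_term
    if mpc is not None and mpc >= 3:
        return "missense_variant_mpc_>=3"
    if mpc is not None and mpc >= 2:
        return "missense_variant_mpc_2-3"
    return "missense_variant_mpc_<2"


# _sketch_missense_bin("stop_gained", None)      == "stop_gained"
# _sketch_missense_bin("missense_variant", 2.5)  == "missense_variant_mpc_2-3"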
def prepare_variant_results():
    results = hl.read_table(pipeline_config.get("BipEx", "variant_results_path"))

    # Get unique variants from results table
    variants = results.group_by(results.locus, results.alleles).aggregate()

    # Select AC/AF numbers for the alternate allele
    results = results.annotate(ac_case=results.ac_case[1], ac_ctrl=results.ac_ctrl[1])
    results = results.drop("af_case", "af_ctrl")

    results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0))

    # Annotate variants with a struct for each analysis group
    results = results.group_by("locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(
        group_results=hl.dict(
            results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    variants = variants.annotate(**results[variants.locus, variants.alleles])

    # Merge variant annotations for canonical transcripts
    annotations = hl.read_table(pipeline_config.get("BipEx", "variant_annotations_path"))
    annotations = annotations.filter(annotations.transcript_id == annotations.canonical_transcript_id)

    annotations = annotations.select(
        "gene_id",
        consequence=annotations.csq_analysis,
        hgvsc=annotations.hgvsc_canonical.split(":")[-1],
        hgvsp=annotations.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=annotations.cadd, mpc=annotations.mpc, polyphen=annotations.polyphen),
    )

    variants = variants.annotate(**annotations[variants.locus, variants.alleles])

    return variants
def prepare_gene_models():
    genes_grch37 = prepare_gene_models_helper("GRCh37")
    genes_grch38 = prepare_gene_models_helper("GRCh38")

    genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value)
    genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value)

    genes = genes_grch37.join(genes_grch38, how="outer")

    # Annotate genes with information from HGNC
    hgnc_path = pipeline_config.get("reference_data", "hgnc_path")
    hgnc = load_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol=hl.or_else(
            genes.symbol,
            hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol),
        ),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr)))
            .append(genes.GRCh38.gencode_gene_symbol)
            .append(genes.GRCh37.gencode_gene_symbol)
            .filter(hl.is_defined)
            .map(lambda s: s.upper())
        ),
    )

    gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path")
    gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path)
    genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id])

    exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path")
    exac_constraint = prepare_exac_constraint(exac_constraint_path)
    genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id])

    staging_path = pipeline_config.get("output", "staging_path")
    genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("Epi25", "gene_results_path"),
        delimiter=",",
        missing="NA",
        quote='"',
        types={
            "gene_id": hl.tstr,
            "gene_name": hl.tstr,
            "description": hl.tstr,
            "pval_meta": hl.tfloat,
            "analysis_group": hl.tstr,
            # LoF
            "xcase_lof": hl.tint,
            "xctrl_lof": hl.tint,
            "pval_lof": hl.tfloat,
            # MPC
            "xcase_mpc": hl.tint,
            "xctrl_mpc": hl.tint,
            "pval_mpc": hl.tfloat,
            # Inframe indel
            "xcase_infrIndel": hl.tint,
            "xctrl_infrIndel": hl.tint,
            "pval_infrIndel": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    # Rename EE group to DEE
    ds = ds.annotate(analysis_group=hl.if_else(ds.analysis_group == "EE", "DEE", ds.analysis_group))

    # "Meta" p-val was carried over from SCHEMA's data format but isn't descriptive of Epi25
    ds = ds.rename({"pval_meta": "pval"})

    ds = ds.group_by("gene_id").aggregate(group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(
        group_results=hl.dict(
            ds.group_results.map(
                lambda group_result: (
                    group_result.analysis_group,
                    group_result.drop("gene_id", "analysis_group"),
                )
            )
        )
    )

    return ds
def main():
    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    command = [
        "hailctl",
        "dataproc",
        "stop",
        "exome-results",
    ]

    for option in ["project", "region"]:
        value = pipeline_config.get("dataproc", option, fallback=None)
        if value:
            command.append(f"--{option}={value}")

    print(" ".join(command[:4]) + " \\\n " + " \\\n ".join(command[4:]))

    subprocess.check_call(command)
def main():
    all_datasets = pipeline_config.get("datasets", "datasets").split(",")

    parser = argparse.ArgumentParser()
    parser.add_argument("datasets", nargs="*", metavar=f"{{{','.join(all_datasets)}}}")
    args = parser.parse_args()

    if args.datasets:
        for dataset in args.datasets:
            if dataset not in all_datasets:
                print(
                    f"error: invalid dataset '{dataset}' (choose from {', '.join(all_datasets)})",
                    file=sys.stderr,
                )
                return 1
        datasets_to_prepare = args.datasets
    else:
        datasets_to_prepare = all_datasets

    hl.init()

    for dataset in datasets_to_prepare:
        prepare_downloads_for_dataset(dataset)
def prepare_gene_results():
    results = hl.read_table(pipeline_config.get("BipEx", "gene_results_path"))

    results = results.select_globals()

    # Select result fields, discard gene information
    results = results.select(
        "gene_id",
        "analysis_group",
        "consequence_category",  # required by the per-category aggregation below
        "case_count",
        "control_count",
        "n_cases",
        "n_controls",
        "fisher_pval",
        "fisher_OR",
        "fisher_gnom_non_psych_pval",
        "fisher_gnom_non_psych_OR",
        "CMH_pval",
        "CMH_OR",
        "CMH_gnom_non_psych_pval",
        "CMH_gnom_non_psych_OR",
    )

    # Drop result fields not shown in browser
    results = results.drop("fisher_pval", "fisher_OR", "CMH_pval", "CMH_OR")

    results = results.annotate(
        # fisher_OR=hl.float(results.fisher_OR),
        fisher_gnom_non_psych_OR=hl.float(results.fisher_gnom_non_psych_OR),
        # CMH_OR=hl.float(results.CMH_OR),
        CMH_gnom_non_psych_OR=hl.float(results.CMH_gnom_non_psych_OR),
    )

    final_results = None

    consequence_categories = results.aggregate(hl.agg.collect_as_set(results.consequence_category))

    per_category_fields = [
        "case_count",
        "control_count",
        # "fisher_pval",
        # "fisher_OR",
        "fisher_gnom_non_psych_pval",
        "fisher_gnom_non_psych_OR",
        # "CMH_pval",
        # "CMH_OR",
        "CMH_gnom_non_psych_pval",
        "CMH_gnom_non_psych_OR",
    ]

    for category in consequence_categories:
        category_results = results.filter(results.consequence_category == category)
        category_results = category_results.key_by("gene_id", "analysis_group")
        category_results = category_results.select(
            n_cases=category_results.n_cases,
            n_controls=category_results.n_controls,
            **{f"{category}_{field}": category_results[field] for field in per_category_fields},
        )

        if final_results is not None:
            final_results = final_results.join(
                category_results.drop("n_cases", "n_controls"),
                "outer",
            )

            # N cases/controls should be the same for all consequence categories for a gene/analysis group.
            # However, if there are no variants of a certain consequence category found in a gene, then
            # N cases/controls for that gene/analysis group/consequence category will be missing.
            final_results = final_results.annotate(
                n_cases=hl.or_else(
                    final_results.n_cases,
                    category_results[final_results.gene_id, final_results.analysis_group].n_cases,
                ),
                n_controls=hl.or_else(
                    final_results.n_controls,
                    category_results[final_results.gene_id, final_results.analysis_group].n_controls,
                ),
            )
        else:
            final_results = category_results

    final_results = final_results.group_by("gene_id").aggregate(
        group_results=hl.agg.collect(final_results.row.drop("gene_id"))
    )
    final_results = final_results.annotate(
        group_results=hl.dict(
            final_results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    return final_results
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("SCHEMA", "gene_results_path"),
        delimiter="\t",
        missing="NA",
        types={
            "Gene ID": hl.tstr,
            "Gene Symbol": hl.tstr,
            "Gene Name": hl.tstr,
            "Case PTV": hl.tint,
            "Ctrl PTV": hl.tint,
            "Case mis3": hl.tint,
            "Ctrl mis3": hl.tint,
            "Case mis2": hl.tint,
            "Ctrl mis2": hl.tint,
            "P ca/co (Class 1)": hl.tfloat,
            "P ca/co (Class 2)": hl.tfloat,
            "P ca/co (comb)": hl.tfloat,
            "De novo PTV": hl.tint,
            "De novo mis3": hl.tint,
            "De novo mis2": hl.tint,
            "P de novo": hl.tfloat,
            "P meta": hl.tfloat,
            "Q meta": hl.tfloat,
            "OR (PTV)": hl.tstr,
            "OR (Class I)": hl.tstr,
            "OR (Class II)": hl.tstr,
        },
    )

    # Parse upper and lower bounds out of odds ratio columns
    def _parse_odds_ratio(field_name):
        return hl.rbind(
            ds[field_name].split(" ", n=2),
            lambda parts: hl.rbind(
                parts[0],
                parts[1][1:-1].split("-", 2),
                lambda value, bounds: hl.struct(
                    **{
                        field_name: hl.float(value),
                        field_name + " lower bound": hl.float(bounds[0]),
                        field_name + " upper bound": hl.float(bounds[1]),
                    }
                ),
            ),
        )

    ds = ds.transmute(**_parse_odds_ratio("OR (PTV)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class I)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class II)"))

    ds = ds.drop("Gene Symbol", "Gene Name")
    ds = ds.rename({"Gene ID": "gene_id"})
    ds = ds.key_by("gene_id")

    ds = ds.select(
        group_results=hl.dict(
            [("meta", hl.struct(**{field: ds[field] for field in ds.row_value.dtype.fields}))]
        )
    )

    return ds
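# For reference, a plain-Python equivalent of _parse_odds_ratio above, assuming
# the odds ratio columns are formatted like "1.62 (1.34-1.96)" (point estimate
# followed by a parenthesized lower-upper range), which is what the string
# slicing implies. Hypothetical helper, not part of the pipeline:
def _sketch_parse_odds_ratio(s):
    value, bounds = s.split(" ", 1)  # "1.62", "(1.34-1.96)"
    lower, upper = bounds[1:-1].split("-", 1)  # strip parentheses, split the range
    return float(value), float(lower), float(upper)


# _sketch_parse_odds_ratio("1.62 (1.34-1.96)") == (1.62, 1.34, 1.96)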
def prepare_downloads_for_dataset(dataset_id):
    output_path = pipeline_config.get("output", "staging_path")
    dataset_prefix = os.path.join(output_path, dataset_id.lower())
    output_prefix = os.path.join(output_path, "downloads", dataset_id)

    gene_results_path = os.path.join(dataset_prefix, "gene_results.ht")
    gene_results = hl.read_table(gene_results_path)
    validate_gene_results_table(gene_results)

    gene_group_result_fields = gene_results.group_results.dtype.value_type.fields

    gene_results_dsv = gene_results
    gene_results_dsv = gene_results_dsv.transmute(
        group_results=hl.array(gene_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[1]
            .annotate(group=group_and_result[0])
            .select("group", *gene_group_result_fields)
        )
    )
    gene_results_dsv = gene_results_dsv.explode(gene_results_dsv.group_results, name="group_result")
    gene_results_dsv = gene_results_dsv.transmute(**gene_results_dsv.group_result)
    gene_results_dsv.export(os.path.join(output_prefix, f"{dataset_id}_gene_results.tsv.bgz"))

    variant_results_path = os.path.join(dataset_prefix, "variant_results.ht")
    variant_results = hl.read_table(variant_results_path)
    validate_variant_results_table(variant_results)

    variant_group_result_fields = variant_results.group_results.dtype.value_type.fields

    variant_results_dsv = variant_results
    variant_results_dsv = variant_results_dsv.transmute(**variant_results_dsv.info)
    variant_results_dsv = variant_results_dsv.transmute(
        group_results=hl.array(variant_results_dsv.group_results).map(
            lambda group_and_result: group_and_result[1]
            .annotate(group=group_and_result[0])
            .select("group", *variant_group_result_fields)
        )
    )
    variant_results_dsv = variant_results_dsv.explode(variant_results_dsv.group_results, name="group_result")
    variant_results_dsv = variant_results_dsv.transmute(**variant_results_dsv.group_result)
    variant_results_dsv.export(os.path.join(output_prefix, f"{dataset_id}_variant_results.tsv.bgz"))

    variant_results_groups = variant_results.aggregate(
        hl.agg.explode(hl.agg.collect_as_set, variant_results.group_results.keys())
    )

    variant_info_fields = variant_results.info.dtype.fields
    variant_base_fields = set(variant_results.row_value) - {"info", "group_results"}

    all_fields = list(variant_base_fields) + list(variant_info_fields) + list(variant_group_result_fields)
    assert len(all_fields) == len(set(all_fields)), "Conflicting field names"

    variant_results_vcf = variant_results
    variant_results_vcf = variant_results_vcf.annotate(groups=variant_results_vcf.group_results.keys())
    variant_results_vcf = variant_results_vcf.select(
        info=hl.struct(
            **{f: variant_results_vcf[f] for f in variant_base_fields},
            **{f: variant_results_vcf.info[f] for f in variant_info_fields},
            groups=variant_results_vcf.groups,
            **dict(
                map(
                    lambda f: (
                        f,
                        variant_results_vcf.groups.map(
                            lambda group: variant_results_vcf.group_results[group][f]
                        ),
                    ),
                    variant_group_result_fields,
                )
            ),
        ),
    )

    # VCF INFO fields cannot hold arrays of booleans, so convert them to 0/1 integers
    def _convert_type(field):
        if isinstance(field.dtype, hl.tarray):
            if field.dtype.element_type == hl.tbool:
                return field.map(hl.int)

        return field

    variant_results_vcf = variant_results_vcf.annotate(
        info=variant_results_vcf.info.annotate(
            **{f: _convert_type(variant_results_vcf.info[f]) for f in all_fields}
        )
    )

    with NamedTemporaryFile("w") as header_file:
        header_file.write(f"analysis_groups={','.join(variant_results_groups)}")
        header_file.flush()  # ensure the header line is on disk before export_vcf reads it

        hl.export_vcf(
            variant_results_vcf,
            os.path.join(output_prefix, f"{dataset_id}_variant_results.vcf.bgz"),
            append_to_header=f"file://{header_file.name}",
            metadata={
                "info": {
                    **{
                        f: {"Number": str(len(variant_results_groups))}
                        for f in variant_group_result_fields
                    }
                }
            },
        )
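# A small illustration of the `metadata` argument assembled above, with
# hypothetical groups and fields (the real ones come from the results table).
# hl.export_vcf uses the per-field "Number" entry to declare, in the VCF header,
# that each of these INFO fields holds one value per analysis group.
def _sketch_vcf_info_metadata():
    variant_results_groups = ["DN", "SWE"]
    variant_group_result_fields = ["ac_case", "ac_ctrl"]

    metadata = {
        "info": {
            f: {"Number": str(len(variant_results_groups))} for f in variant_group_result_fields
        }
    }
    assert metadata == {"info": {"ac_case": {"Number": "2"}, "ac_ctrl": {"Number": "2"}}}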
def combine_datasets(dataset_ids):
    gene_models_path = f"{pipeline_config.get('output', 'staging_path')}/gene_models.ht"
    ds = hl.read_table(gene_models_path)

    ds = ds.annotate(gene_results=hl.struct(), variants=hl.struct())
    ds = ds.annotate_globals(meta=hl.struct(variant_fields=VARIANT_FIELDS, datasets=hl.struct()))

    for dataset_id in dataset_ids:
        dataset_path = os.path.join(pipeline_config.get("output", "staging_path"), dataset_id.lower())

        gene_results = hl.read_table(os.path.join(dataset_path, "gene_results.ht"))

        gene_group_result_field_names = gene_results.group_results.dtype.value_type.fields
        # Strip the bit width from type names ("int32" -> "int", "float64" -> "float")
        gene_group_result_field_types = [
            str(typ).rstrip("3264") for typ in gene_results.group_results.dtype.value_type.types
        ]
        gene_result_analysis_groups = list(
            gene_results.aggregate(hl.agg.explode(hl.agg.collect_as_set, gene_results.group_results.keys()))
        )
        gene_results = gene_results.annotate(
            group_results=hl.array(
                [
                    hl.tuple(
                        [
                            gene_results.group_results.get(group)[field]
                            for field in gene_group_result_field_names
                        ]
                    )
                    for group in gene_result_analysis_groups
                ]
            )
        )

        ds = ds.annotate(gene_results=ds.gene_results.annotate(**{dataset_id: gene_results[ds.gene_id]}))

        variant_results = hl.read_table(os.path.join(dataset_path, "variant_results.ht"))

        reference_genome = variant_results.locus.dtype.reference_genome.name
        variant_info_field_names = variant_results.info.dtype.fields
        variant_info_field_types = [str(typ).rstrip("3264") for typ in variant_results.info.dtype.types]
        variant_group_result_field_names = variant_results.group_results.dtype.value_type.fields
        variant_group_result_field_types = [
            str(typ).rstrip("3264") for typ in variant_results.group_results.dtype.value_type.types
        ]
        variant_result_analysis_groups = list(
            variant_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set, variant_results.group_results.keys())
            )
        )

        variant_results = variant_results.annotate(
            info=hl.tuple([variant_results.info[field] for field in variant_info_field_names]),
            group_results=hl.array(
                [
                    hl.rbind(
                        variant_results.group_results.get(group),
                        lambda group_result: hl.or_missing(
                            hl.is_defined(group_result),
                            hl.tuple(
                                [group_result[field] for field in variant_group_result_field_names]
                            ),
                        ),
                    )
                    for group in variant_result_analysis_groups
                ]
            ),
        )

        variant_results = variant_results.annotate(
            variant_id=variant_results.locus.contig.replace("^chr", "")
            + "-"
            + hl.str(variant_results.locus.position)
            + "-"
            + variant_results.alleles[0]
            + "-"
            + variant_results.alleles[1],
            pos=variant_results.locus.position,
        )
        variant_results = variant_results.annotate(
            variant=hl.tuple([variant_results[field] for field in VARIANT_FIELDS])
        )
        variant_results = variant_results.group_by("gene_id").aggregate(
            variants=hl.agg.collect(variant_results.variant)
        )

        ds = ds.annotate(
            variants=ds.variants.annotate(
                **{
                    dataset_id: hl.or_else(
                        variant_results[ds.gene_id].variants,
                        hl.empty_array(variant_results.variants.dtype.element_type),
                    )
                }
            )
        )

        ds = ds.annotate_globals(
            meta=ds.globals.meta.annotate(
                datasets=ds.globals.meta.datasets.annotate(
                    **{
                        dataset_id: hl.struct(
                            reference_genome=reference_genome,
                            gene_result_analysis_groups=gene_result_analysis_groups
                            or hl.empty_array(hl.tstr),
                            gene_group_result_field_names=gene_group_result_field_names
                            or hl.empty_array(hl.tstr),
                            gene_group_result_field_types=gene_group_result_field_types
                            or hl.empty_array(hl.tstr),
                            variant_info_field_names=variant_info_field_names
                            or hl.empty_array(hl.tstr),
                            variant_info_field_types=variant_info_field_types
                            or hl.empty_array(hl.tstr),
                            variant_result_analysis_groups=variant_result_analysis_groups
                            or hl.empty_array(hl.tstr),
                            variant_group_result_field_names=variant_group_result_field_names
                            or hl.empty_array(hl.tstr),
                            variant_group_result_field_types=variant_group_result_field_types
                            or hl.empty_array(hl.tstr),
                        ),
                    }
                )
            )
        )

    return ds
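# The str(typ).rstrip("3264") expressions above normalize Hail type names by
# stripping the bit-width suffix before recording them in the table's metadata.
# A quick check of the behavior:
def _sketch_type_name_normalization():
    assert str(hl.tint32).rstrip("3264") == "int"      # "int32"   -> "int"
    assert str(hl.tfloat64).rstrip("3264") == "float"  # "float64" -> "float"
    assert str(hl.tstr).rstrip("3264") == "str"        # no suffix -> unchanged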
def prepare_variant_results():
    variant_results = hl.import_table(
        pipeline_config.get("Epi25", "variant_results_path"),
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "AC case": hl.tint,
            "AC control": hl.tint,
            "AF case": hl.tfloat,
            "AF control": hl.tfloat,
            "AN case": hl.tint,
            "AN control": hl.tint,
            "Analysis group": hl.tstr,
            "Estimate": hl.tfloat,
            "I2": hl.tfloat,
            "N denovos": hl.tint,
            "P-value": hl.tfloat,
            "Qp": hl.tfloat,
            "SE": hl.tfloat,
        },
    )

    variant_results = variant_results.rename(
        {
            "AC case": "ac_case",
            "AC control": "ac_ctrl",
            "AF case": "af_case",
            "AF control": "af_ctrl",
            "AN case": "an_case",
            "AN control": "an_ctrl",
            "Analysis group": "analysis_group",
        },
    )

    # Rename "EE" analysis group to "DEE"
    variant_results = variant_results.annotate(
        analysis_group=hl.cond(
            variant_results.analysis_group == "EE", "DEE", variant_results.analysis_group
        )
    )

    variant_results = variant_results.drop("af_case", "af_ctrl")

    variant_results = variant_results.group_by("Variant ID").aggregate(
        group_results=hl.agg.collect(variant_results.row_value)
    )
    variant_results = variant_results.annotate(
        group_results=hl.dict(
            variant_results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    variant_annotations = hl.import_table(
        pipeline_config.get("Epi25", "variant_annotations_path"),
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "CADD": hl.tfloat,
            "Comment": hl.tstr,
            "Consequence (canonical)": hl.tstr,
            "Consequence (for analysis)": hl.tstr,
            "Consequence (worst)": hl.tstr,
            "Flags": hl.tstr,
            "Gene ID": hl.tstr,
            "Gene name": hl.tstr,
            "HGVSc (canonical)": hl.tstr,
            "HGVSc": hl.tstr,
            "HGVSp (canonical)": hl.tstr,
            "HGVSp": hl.tstr,
            "In analysis": hl.tbool,
            "MPC": hl.tfloat,
            "Polyphen": hl.tstr,
            "Source": hl.tstr,
            "Transcript ID (canonical)": hl.tstr,
            "Transcript ID(s)": hl.tstr,
        },
    )

    variant_annotations = variant_annotations.rename(
        {
            "CADD": "cadd",
            "Comment": "comment",
            "Consequence (canonical)": "csq_canonical",
            "Consequence (for analysis)": "csq_analysis",
            "Consequence (worst)": "csq_worst",
            "Flags": "flags",
            "Gene ID": "gene_id",
            "Gene name": "gene_name",
            "HGVSc (canonical)": "hgvsc_canonical",
            "HGVSc": "hgvsc",
            "HGVSp (canonical)": "hgvsp_canonical",
            "HGVSp": "hgvsp",
            "In analysis": "in_analysis",
            "MPC": "mpc",
            "Polyphen": "polyphen",
            "Source": "source",
            "Transcript ID (canonical)": "canonical_transcript_id",
            "Transcript ID(s)": "transcript_id",
        }
    )

    variant_annotations = variant_annotations.select(
        "gene_id",
        consequence=variant_annotations.csq_analysis,
        hgvsc=variant_annotations.hgvsc_canonical.split(":")[-1],
        hgvsp=variant_annotations.hgvsp_canonical.split(":")[-1],
        info=hl.struct(
            comment=variant_annotations.comment,
            in_analysis=variant_annotations.in_analysis,
            cadd=variant_annotations.cadd,
            mpc=variant_annotations.mpc,
            polyphen=variant_annotations.polyphen,
        ),
    )

    variants = variant_annotations.annotate(
        group_results=variant_results[variant_annotations.key].group_results
    )

    # Parse "chrom:pos:ref:alt" variant IDs into locus/alleles
    variants = variants.annotate(
        locus=hl.rbind(
            variants["Variant ID"].split(":"),
            lambda p: hl.locus(p[0], hl.int(p[1]), reference_genome="GRCh37"),
        ),
        alleles=hl.rbind(variants["Variant ID"].split(":"), lambda p: [p[2], p[3]]),
    )
    variants = variants.key_by("locus", "alleles")

    return variants
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("pipeline", choices=PIPELINES, help="Pipeline to run")
    parser.add_argument(
        "--environment",
        choices=("local", "dataproc"),
        default="local",
        help="Environment in which to run the pipeline (defaults to %(default)s)",
    )
    parser.add_argument("--dry-run", action="store_true", help="Print pipeline command without running it")
    args, other_args = parser.parse_known_args()

    # Set working directory so that config.py finds pipeline_config.ini
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from data_pipeline.config import pipeline_config  # pylint: disable=import-outside-toplevel

    start_time = time.time()

    if args.environment == "local":
        command = ["python3", "-m", f"data_pipeline.pipelines.{args.pipeline}"]
        if other_args:
            command.extend(other_args)

        print(" ".join(command[:2]) + " \\\n " + " \\\n ".join(command[2:]))

        if not args.dry_run:
            sys.path.insert(1, os.getcwd())

            try:
                subprocess.check_call(
                    command,
                    env={**os.environ, "PYSPARK_SUBMIT_ARGS": "--driver-memory 4g pyspark-shell"},
                )
                elapsed_time = time.time() - start_time
                print(f"Done in {int(elapsed_time // 60)}m{int(elapsed_time % 60)}s")
            except subprocess.CalledProcessError:
                print(f"Error running data_pipeline/pipelines/{args.pipeline}.py")
                sys.exit(1)

    elif args.environment == "dataproc":
        # Zip contents of data_pipeline directory for upload to Dataproc cluster
        with tempfile.NamedTemporaryFile(prefix="pyfiles_", suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file.name, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for root, _, files in os.walk("data_pipeline"):
                    for name in files:
                        if name.endswith(".py"):
                            zip_file.write(
                                os.path.join(root, name),
                                os.path.relpath(os.path.join(root, name)),
                            )

            # `hailctl dataproc submit` does not support project/region/zone arguments,
            # so use `gcloud dataproc jobs submit` instead.
            command = [
                "gcloud",
                "dataproc",
                "jobs",
                "submit",
                "pyspark",
            ]

            for option in ["project", "region"]:
                value = pipeline_config.get("dataproc", option, fallback=None)
                if value:
                    command.append(f"--{option}={value}")

            command.extend(
                [
                    "--cluster=exome-results",
                    f"--py-files={tmp_file.name}",
                    "--files=pipeline_config.ini",
                    f"data_pipeline/pipelines/{args.pipeline}.py",
                ]
            )

            if other_args:
                command.append("--")
                command.extend(other_args)

            print(" ".join(command[:5]) + " \\\n " + " \\\n ".join(command[5:]))

            if not args.dry_run:
                subprocess.check_call(command)
                elapsed_time = time.time() - start_time
                print(f"Done in {int(elapsed_time // 60)}m{int(elapsed_time % 60)}s")
def prepare_variant_results():
    annotations = None
    results = None

    for group in ("dn", "dbs", "swe"):
        group_annotations_path = pipeline_config.get("ASC", f"{group}_variant_annotations_path")
        group_results_path = pipeline_config.get("ASC", f"{group}_variant_results_path")

        group_annotations = hl.import_table(
            group_annotations_path,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )
        group_annotations = group_annotations.repartition(100, shuffle=True)

        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

        group_results = hl.import_table(
            group_results_path,
            force=True,
            min_partitions=100,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tint,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tint,
                "af_ctrl": hl.tstr,
            },
        )
        group_results = group_results.repartition(100, shuffle=True)
        group_results = group_results.drop("af_case", "af_ctrl")
        group_results = group_results.annotate(in_analysis=group_annotations[group_results.v].in_analysis)

        if results is None:
            results = group_results
        else:
            results = results.union(group_results)

    annotations = annotations.cache()
    results = results.cache()

    annotations = annotations.distinct()
    annotations = annotations.cache()

    annotations = annotations.select(
        "gene_id",
        consequence=hl.sorted(
            annotations.csq_analysis.split(","),
            lambda c: CONSEQUENCE_TERM_RANKS.get(c),  # pylint: disable=unnecessary-lambda
        )[0],
        hgvsc=annotations.hgvsc.split(":")[-1],
        hgvsp=annotations.hgvsp.split(":")[-1],
        info=hl.struct(mpc=annotations.mpc, polyphen=annotations.polyphen),
    )

    results = results.group_by("v").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(
        group_results=hl.dict(
            results.group_results.map(
                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
            )
        )
    )

    variants = annotations.annotate(group_results=results[annotations.key].group_results)

    variants = variants.annotate(
        locus=hl.rbind(
            variants.v.split(":"), lambda p: hl.locus(p[0], hl.int(p[1]), reference_genome="GRCh37")
        ),
        alleles=hl.rbind(variants.v.split(":"), lambda p: [p[2], p[3]]),
    )
    variants = variants.key_by("locus", "alleles")

    return variants