hc = hail.HailContext(log="/hail.log") logger.info("\n==> import vds: " + input_vds_path) vds = hc.read(input_vds_path) parallel_computed_annotation_exprs = [ "va.variantId = %s" % get_expr_for_variant_id(), "va.contig = %s" % get_expr_for_contig(), "va.start = %s" % get_expr_for_start_pos(), "va.pos = %s" % get_expr_for_start_pos(), "va.end = %s" % get_expr_for_end_pos(), "va.ref = %s" % get_expr_for_ref_allele(), "va.alt = %s" % get_expr_for_alt_allele(), "va.xpos = %s" % get_expr_for_xpos(pos_field="start"), "va.xstart = %s" % get_expr_for_xpos(pos_field="start"), ] serial_computed_annotation_exprs = [ "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"), ] vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs) vds = vds.annotate_variants_expr(serial_computed_annotation_exprs) # apply schema to dataset INPUT_SCHEMA = { "top_level_fields": """ variantId: String, originalAltAlleles: Set[String],
ReadPosRankSum: Double, SOR: Double, VQSLOD: Double, culprit: String, AC_Hom: Array[Int], AC_Het: Array[Int], AC_Hemi: Array[Int], """ } vds_computed_annotations_exprs = [ "va.chrom = %s" % get_expr_for_contig(), "va.pos = %s" % get_expr_for_start_pos(), "va.ref = %s" % get_expr_for_ref_allele(), "va.alt = %s" % get_expr_for_alt_allele(), "va.xpos = %s" % get_expr_for_xpos(), "va.variantId = %s" % get_expr_for_variant_id(), "va.originalAltAlleles = %s" % get_expr_for_orig_alt_alleles_set(), "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(), "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(), "va.transcriptConsequenceTerms = %s" % get_expr_for_vep_consequence_terms_set(), "va.sortedTranscriptConsequences = %s" % get_expr_for_vep_sorted_transcript_consequences_array(), "va.mainTranscript = %s" % get_expr_for_worst_transcript_consequence_annotations_struct("va.sortedTranscriptConsequences"), "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)" ] print("======== Exomes: KT Schema ========") for expr in vds_computed_annotations_exprs: vds = vds.annotate_variants_expr(expr) kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(split_multi=False, **SCHIZOPHRENIA_SCHEMA)
pprint(kt_rare_variants.schema) ES_HOST_IP = '10.4.0.13' ES_HOST_PORT = 9200 print("======== Export to elasticsearch ======") es = ElasticsearchClient( host=ES_HOST_IP, port=ES_HOST_PORT, ) annotation_expressions = [ 'variant_id = %s' % get_expr_for_variant_id(), 'chrom = %s' % get_expr_for_contig(), 'pos = %s' % get_expr_for_start_pos(), "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"), ] for expression in annotation_expressions: kt_rare_variants = kt_rare_variants.annotate(expression) kt_rare_variants = kt_rare_variants.drop(['v']) kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v']) kt_rare_variants = kt_rare_variants.key_by('variantId').join(kt_annotations.key_by('variantId')) pprint(kt_rare_variants.schema) es.export_kt_to_elasticsearch( kt_rare_variants,
""" vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set()) vds = add_mpc_to_vds(hc, vds, args.genome_version, root="va.info", info_fields=MPC_INFO_FIELDS) pprint(vds.variant_schema) for expr in vds_computed_annotations_exprs: vds = vds.annotate_variants_expr(expr) kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(**GNOMAD_SCHEMA) # print kt_variant_expr kt = vds.make_table(kt_variant_expr, []) # pprint(kt.schema) kt = kt.annotate("pos = start") kt = kt.annotate("stop = %s" % get_expr_for_end_pos(field_prefix="", pos_field="start", ref_field="ref")) kt = kt.annotate("xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="start")) kt = kt.annotate("xstart = %s" % get_expr_for_xpos(field_prefix="", pos_field="start")) kt = kt.annotate("xstop = %s" % get_expr_for_xpos(field_prefix="", pos_field="stop")) # flatten and prune mainTranscript transcript_annotations_to_keep = [ "amino_acids", "biotype", "canonical", "cdna_start", "cdna_end", "codons", #"distance", "domains", "exon", "gene_id",