Ejemplo n.º 1
0
# Start Hail (its log file is /hail.log) and load the input dataset.
hc = hail.HailContext(log="/hail.log")

logger.info("\n==> import vds: " + input_vds_path)
vds = hc.read(input_vds_path)

# Annotations submitted to Hail together in ONE annotate_variants_expr() call.
# Each entry assigns a single va.* field. va.start / va.pos are built from the
# same helper call, as are va.xpos / va.xstart.
# NOTE(review): presumably none of these read a va.* field written by a
# sibling entry (get_expr_for_xpos is called without field_prefix here) --
# confirm against the helper's default.
parallel_computed_annotation_exprs = [
    "va.variantId = %s" % get_expr_for_variant_id(),

    "va.contig = %s" % get_expr_for_contig(),
    "va.start = %s" % get_expr_for_start_pos(),
    "va.pos = %s" % get_expr_for_start_pos(),
    "va.end = %s" % get_expr_for_end_pos(),
    "va.ref = %s" % get_expr_for_ref_allele(),
    "va.alt = %s" % get_expr_for_alt_allele(),

    "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
    "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
]

# Annotations that must run AFTER the pass above: va.xstop is built with
# field_prefix="va." and pos_field="end", i.e. it presumably reads the va.end
# field written by the parallel pass.
serial_computed_annotation_exprs = [
    "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
]

vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs)

# Apply the serial expressions one call at a time, in list order, so that an
# expression added later can read fields produced by an earlier one. Passing
# the whole list in a single call would treat them like the parallel batch.
for expr in serial_computed_annotation_exprs:
    vds = vds.annotate_variants_expr(expr)

# apply schema to dataset
INPUT_SCHEMA = {
    "top_level_fields": """
            variantId: String,
            originalAltAlleles: Set[String],
Ejemplo n.º 2
0
         ReadPosRankSum: Double,
         SOR: Double,
         VQSLOD: Double,
         culprit: String,
         AC_Hom: Array[Int],
         AC_Het: Array[Int],
         AC_Hemi: Array[Int],
    """
}

def _va_assignment(field, value_expr):
    """Return the expression string that assigns *value_expr* to ``va.<field>``."""
    return "va.%s = %s" % (field, value_expr)


# Variant annotation expressions, in the order they are meant to be applied.
# Order matters near the end: va.mainTranscript is built from
# va.sortedTranscriptConsequences, and only afterwards is that field
# overwritten with its json(...) form by the final entry.
vds_computed_annotations_exprs = [
    _va_assignment("chrom", get_expr_for_contig()),
    _va_assignment("pos", get_expr_for_start_pos()),
    _va_assignment("ref", get_expr_for_ref_allele()),
    _va_assignment("alt", get_expr_for_alt_allele()),
    _va_assignment("xpos", get_expr_for_xpos()),

    _va_assignment("variantId", get_expr_for_variant_id()),
    _va_assignment("originalAltAlleles", get_expr_for_orig_alt_alleles_set()),
    _va_assignment("geneIds", get_expr_for_vep_gene_ids_set()),
    _va_assignment("transcriptIds", get_expr_for_vep_transcript_ids_set()),
    _va_assignment(
        "transcriptConsequenceTerms",
        get_expr_for_vep_consequence_terms_set(),
    ),
    _va_assignment(
        "sortedTranscriptConsequences",
        get_expr_for_vep_sorted_transcript_consequences_array(),
    ),
    _va_assignment(
        "mainTranscript",
        get_expr_for_worst_transcript_consequence_annotations_struct(
            "va.sortedTranscriptConsequences"
        ),
    ),
    "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)",
]

# Exomes / schizophrenia export section.
# NOTE(review): kt_rare_variants, kt_annotations and SCHIZOPHRENIA_SCHEMA are
# not defined in the visible part of this file, and the export call that
# follows this section is cut off -- this may be disabled or partial code;
# confirm before relying on it.
print("======== Exomes: KT Schema ========")
# Apply the computed variant annotations one expression per call, in list order.
for expr in vds_computed_annotations_exprs:
    vds = vds.annotate_variants_expr(expr)
# NOTE(review): kt_variant_expr is assigned here but not used again in this
# section; the schema printed next is that of kt_rare_variants, not of a table
# made from this expression.
kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(split_multi=False, **SCHIZOPHRENIA_SCHEMA)
pprint(kt_rare_variants.schema)

# Elasticsearch endpoint (hard-coded host and port).
ES_HOST_IP = '10.4.0.13'
ES_HOST_PORT = 9200

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=ES_HOST_IP,
    port=ES_HOST_PORT,
)

# Extra key-table columns, added one annotate() call at a time below.
# xpos is built with an empty field prefix from the "pos" column, so it has to
# be applied after the 'pos = ...' entry.
# NOTE(review): this list creates a column named variant_id, while the join
# further down keys both tables on variantId -- confirm kt_rare_variants
# already carries a variantId column.
annotation_expressions = [
    'variant_id = %s' % get_expr_for_variant_id(),
    'chrom = %s' % get_expr_for_contig(),
    'pos = %s' % get_expr_for_start_pos(),
    "xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="pos"),
]

for expression in annotation_expressions:
    kt_rare_variants = kt_rare_variants.annotate(expression)

# Drop the 'v' column once the derived columns have been computed.
kt_rare_variants = kt_rare_variants.drop(['v'])

# Give the annotations table a variantId column, then drop its 'v' column too.
kt_annotations = kt_annotations.annotate('variantId = %s' % get_expr_for_variant_id()).drop(['v'])

# Join the two tables on variantId.
kt_rare_variants = kt_rare_variants.key_by('variantId').join(kt_annotations.key_by('variantId'))

# Print the schema of the joined table.
pprint(kt_rare_variants.schema)

es.export_kt_to_elasticsearch(
    kt_rare_variants,
"""

# Store the original alt-allele set on each variant, then hand the dataset to
# add_mpc_to_vds (defined elsewhere), which is called with root "va.info".
vds = vds.annotate_variants_expr("va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set())
vds = add_mpc_to_vds(hc, vds, args.genome_version, root="va.info", info_fields=MPC_INFO_FIELDS)

pprint(vds.variant_schema)

# Apply the computed annotations one expression per call, in list order.
for expr in vds_computed_annotations_exprs:
    vds = vds.annotate_variants_expr(expr)

# Turn the gnomAD schema into a make_table argument and build the key table,
# then add the positional columns. Each annotate() runs on the result of the
# previous one: "stop" has to exist before "xstop" is derived from it.
kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(**GNOMAD_SCHEMA)
kt = (
    vds.make_table(kt_variant_expr, [])
    .annotate("pos = start")
    .annotate("stop = %s" % get_expr_for_end_pos(field_prefix="", pos_field="start", ref_field="ref"))
    .annotate("xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
    .annotate("xstart = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
    .annotate("xstop = %s" % get_expr_for_xpos(field_prefix="", pos_field="stop"))
)

# flatten and prune mainTranscript
transcript_annotations_to_keep = [
    "amino_acids",
    "biotype",
    "canonical",
    "cdna_start",
    "cdna_end",
    "codons",
    #"distance",
    "domains",
    "exon",
    "gene_id",