Exemple #1
0
# Derive the path prefix by dropping only a trailing ".vds" extension.
# str.replace() would also remove ".vds" from the middle of the path (for
# example a parent directory named "batch1.vds_tmp"), so slice the suffix off
# instead. A trailing "/" is removed first so "x.vds/" and "x.vds" yield the
# same prefix.
input_vds_path_prefix = input_vds_path.rstrip("/")
if input_vds_path_prefix.endswith(".vds"):
    input_vds_path_prefix = input_vds_path_prefix[:-len(".vds")]

logger.info("\n==> create HailContext")
hc = hail.HailContext(log="/hail.log")

# Pass the path as a logger argument so formatting is deferred to the logger.
logger.info("\n==> import vds: %s", input_vds_path)
vds = hc.read(input_vds_path)

# First batch of variant annotations: every entry becomes a
# "va.<field> = <expr>" string and the whole list is handed to a single
# annotate_variants_expr() call.
parallel_computed_annotation_exprs = [
    "va.%s = %s" % (field_name, field_expr)
    for field_name, field_expr in (
        ("variantId", get_expr_for_variant_id()),
        # locus / allele fields
        ("contig", get_expr_for_contig()),
        ("start", get_expr_for_start_pos()),
        ("pos", get_expr_for_start_pos()),
        ("end", get_expr_for_end_pos()),
        ("ref", get_expr_for_ref_allele()),
        ("alt", get_expr_for_alt_allele()),
        # xpos and xstart are both built from the "start" position field
        ("xpos", get_expr_for_xpos(pos_field="start")),
        ("xstart", get_expr_for_xpos(pos_field="start")),
    )
]

# Second batch, applied in its own annotate_variants_expr() call after the
# first one. va.xstop is built with field_prefix="va." and pos_field="end".
# NOTE(review): presumably it runs separately because it reads va.end, which
# the first batch creates -- confirm against get_expr_for_xpos.
serial_computed_annotation_exprs = [
    "va.xstop = %s" % get_expr_for_xpos(field_prefix="va.", pos_field="end"),
]

vds = vds.annotate_variants_expr(
    parallel_computed_annotation_exprs
).annotate_variants_expr(
    serial_computed_annotation_exprs
)

# apply schema to dataset
INPUT_SCHEMA = {
    obs_exp: Double,
"""

# Store the expression from get_expr_for_orig_alt_alleles_set() under
# va.originalAltAlleles.
orig_alt_alleles_expr = "va.originalAltAlleles=%s" % get_expr_for_orig_alt_alleles_set()
vds = vds.annotate_variants_expr(orig_alt_alleles_expr)

# Add the MPC fields listed in MPC_INFO_FIELDS under the va.info root,
# for the genome version given on the command line.
vds = add_mpc_to_vds(
    hc,
    vds,
    args.genome_version,
    root="va.info",
    info_fields=MPC_INFO_FIELDS,
)

# Print the variant schema before the computed annotations are added.
pprint(vds.variant_schema)

# Each computed annotation is applied in its own annotate_variants_expr()
# call, in list order.
for annotation_expr in vds_computed_annotations_exprs:
    vds = vds.annotate_variants_expr(annotation_expr)

# Turn the GNOMAD_SCHEMA entries into the variant expression for make_table().
kt_variant_expr = convert_vds_schema_string_to_vds_make_table_arg(**GNOMAD_SCHEMA)

# The second argument is an empty list.
# NOTE(review): in Hail 0.1 this looks like the genotype-expression list,
# i.e. no per-genotype columns -- confirm against the make_table docs.
kt = vds.make_table(kt_variant_expr, [])

# Add the positional columns to the key table one annotate() call at a time,
# in this order: "pos" copies "start", "stop" comes from
# get_expr_for_end_pos(), and xpos / xstart / xstop come from
# get_expr_for_xpos(). Every helper is called with field_prefix="".
# xstop reads the "stop" column added two calls earlier.
kt = (
    kt.annotate("pos = start")
    .annotate("stop = %s" % get_expr_for_end_pos(field_prefix="", pos_field="start", ref_field="ref"))
    .annotate("xpos = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
    .annotate("xstart = %s" % get_expr_for_xpos(field_prefix="", pos_field="start"))
    .annotate("xstop = %s" % get_expr_for_xpos(field_prefix="", pos_field="stop"))
)

# flatten and prune mainTranscript
transcript_annotations_to_keep = [
    "amino_acids",
    "biotype",
    "canonical",
    "cdna_start",
    "cdna_end",
    "codons",
    #"distance",
    "domains",
    "exon",