Esempio n. 1
0
def populate_gtex():
    """Compute per-tissue mean expression per transcript and export it.

    Steps, in order:

    1. Import the GTEx sample-attribute table, keyed by ``SAMPID``.
    2. Import the expression matrix (float32 entries, rows keyed by
       ``transcript_id``) and rename the row fields to ``transcriptId`` /
       ``geneId``.
    3. Annotate every column with ``tissue``, taken from the ``SMTSD``
       attribute of the sample row matching the column's ``col_id``.
    4. For each tissue seen on at least one column (sorted order), add a row
       field holding the mean of entry field ``x`` over that tissue's columns.
       The field name comes from ``tissue_abbr``, which is not defined in this
       function and must be in scope.
    5. Export the rows table through ``export_ht_to_es``.
    """
    sample_attributes = hl.import_table(
        '/home/ml2529/gtex_data/GTEx_v7_Annotations_SampleAttributesDS.txt',
        delimiter='\t',
        key='SAMPID')

    # NOTE: this reads the ENSG00000177732 extract; the full matrix previously
    # referenced here was
    # GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.bgz.
    row_field_types = {'transcript_id': hl.tstr, 'gene_id': hl.tstr}
    mt = hl.import_matrix_table(
        '/home/ml2529/gtex_data/ENSG00000177732.tsv',
        row_key='transcript_id',
        row_fields=row_field_types,
        entry_type=hl.tfloat32)
    mt = mt.rename({'transcript_id': 'transcriptId', 'gene_id': 'geneId'})

    # Attach the tissue label of each sample to its column.
    mt = mt.annotate_cols(tissue=sample_attributes[mt.col_id].SMTSD)

    # Count columns per tissue, skipping columns with no tissue annotation;
    # only the keys (the distinct tissue names) are used below.
    tissue_counter = hl.agg.filter(hl.is_defined(mt.tissue),
                                   hl.agg.counter(mt.tissue))
    cut_data = mt.aggregate_cols(hl.struct(tissue=tissue_counter))

    # One new row field per tissue: mean of entry `x` across that tissue's
    # columns.
    for tissue_name in sorted(cut_data['tissue']):
        tissue_mean = hl.agg.filter(mt.tissue == tissue_name,
                                    hl.agg.mean(mt.x))
        field_name = f"{tissue_abbr[tissue_name]}"
        mt = mt.transmute_rows(**{field_name: tissue_mean})

    ht = mt.rows()

    export_ht_to_es(ht,
                    index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
    '''
Esempio n. 2
0
def populate_dnms():
    """Import two de novo variant TSV files and export each to ``autism_dnms``.

    Each file is read as a tab-delimited table with imputed column types and
    passed to ``export_ht_to_es`` with ``index_name='autism_dnms'`` and
    ``index_type='variant'``. The files are processed in the order listed.
    """
    dnm_tsv_paths = (
        '/home/ubuntu/data/SSC_denovo_wgs_cshl_variant_id_gene_id_conf.tsv',
        '/home/ubuntu/data/merged_dnms_cohortFreq12_perFamily_variant_id_042320.tsv',
    )

    # NOTE(review): both exports target the same index name. Whether the second
    # export appends to or replaces the first depends on export_ht_to_es, which
    # is not visible here -- confirm this is intended.
    for tsv_path in dnm_tsv_paths:
        dnm_table = hl.import_table(tsv_path, delimiter='\t', impute=True)
        export_ht_to_es(dnm_table, index_name='autism_dnms', index_type='variant')
Esempio n. 3
0
def populate_gtex():
    """Import a GTEx transcript-by-tissue table and export it.

    The input is a tab-delimited, block-gzipped file keyed by
    ``transcript_id`` with imputed column types. Judging by its file name it
    holds per-tissue median TPMs; verify against the data. ``transcript_id``
    and ``gene_id`` are renamed to ``transcriptId`` / ``geneId`` before the
    table is passed to ``export_ht_to_es``.
    """
    medians_path = (
        '/home/ml2529/gtex_data/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm_medians_by_tissue_wo_versions.tsv.gz'
    )
    # force_bgz makes Hail read the .gz file as block-gzipped.
    ht = hl.import_table(
        medians_path,
        delimiter='\t',
        key='transcript_id',
        force_bgz=True,
        impute=True)

    camel_case_names = {'transcript_id': 'transcriptId', 'gene_id': 'geneId'}
    ht = ht.rename(camel_case_names)

    export_ht_to_es(ht,
                    index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
Esempio n. 4
0
def populate_clinvar():
    """Export the prebuilt ``clinvar.ht`` table to the ``clinvar_grch37`` index.

    Only the export runs. The table is read from ``clinvar.ht`` in the
    working directory, so that file must already exist. The steps that built
    it (VCF import, VEP annotation, row selection, write) are disabled and
    kept below as comments for reference.
    """
    # --- Disabled: build clinvar.ht from the ClinVar VCF ---------------------
    # clinvar_release_date = _parse_clinvar_release_date('clinvar.vcf.gz')
    # mt = import_vcf('clinvar.vcf.gz', "37", drop_samples=True, min_partitions=2000, skip_invalid_loci=True)
    # mt = mt.annotate_globals(version=clinvar_release_date)
    #
    # print("\n=== Running VEP ===")
    # mt = hl.vep(mt, 'vep85-loftee-local.json', name="vep")
    #
    # print("\n=== Processing ===")
    # mt = mt.annotate_rows(
    #     sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep)
    # )
    #
    # mt = mt.annotate_rows(
    #     main_transcript=get_expr_for_worst_transcript_consequence_annotations_struct(
    #         vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
    #     )
    # )
    #
    # mt = mt.annotate_rows(
    #     gene_ids=get_expr_for_vep_gene_ids_set(
    #         vep_transcript_consequences_root=mt.sortedTranscriptConsequences
    #     ),
    # )
    #
    # review_status_str = hl.delimit(hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)), key=lambda s: s.replace("^_", "z")))
    #
    # mt = mt.select_rows(
    #     allele_id=mt.info.ALLELEID,
    #     alt=get_expr_for_alt_allele(mt),
    #     chrom=get_expr_for_contig(mt.locus),
    #     clinical_significance=hl.delimit(hl.sorted(hl.array(hl.set(mt.info.CLNSIG)), key=lambda s: s.replace("^_", "z"))),
    #     domains=get_expr_for_vep_protein_domains_set(vep_transcript_consequences_root=mt.vep.transcript_consequences),
    #     gene_ids=mt.gene_ids,
    #     gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
    #         vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
    #         gene_ids=mt.gene_ids
    #     ),
    #     gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
    #     **{f"main_transcript_{field}": mt.main_transcript[field] for field in mt.main_transcript.dtype.fields},
    #     pos=get_expr_for_start_pos(mt),
    #     ref=get_expr_for_ref_allele(mt),
    #     review_status=review_status_str,
    #     transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
    #         vep_transcript_consequences_root=mt.sortedTranscriptConsequences
    #     ),
    #     transcript_ids=get_expr_for_vep_transcript_ids_set(
    #         vep_transcript_consequences_root=mt.sortedTranscriptConsequences
    #     ),
    #     transcript_id_to_consequence_json=get_expr_for_vep_transcript_id_to_consequence_map(
    #         vep_transcript_consequences_root=mt.sortedTranscriptConsequences
    #     ),
    #     variant_id=get_expr_for_variant_id(mt),
    #     xpos=get_expr_for_xpos(mt.locus),
    # )
    #
    # #print("\n=== Summary ===")
    # #hl.summarize_variants(mt)
    #
    # # Drop key columns for export
    # rows = mt.rows()
    # rows = rows.order_by(rows.variant_id).drop("locus", "alleles")
    # rows.write('clinvar.ht',overwrite=True)
    # --- End of disabled build steps -----------------------------------------

    print("\n=== Exporting to Elasticsearch ===")
    clinvar_rows = hl.read_table('clinvar.ht')
    export_ht_to_es(
        clinvar_rows,
        index_name='clinvar_grch37',
        index_type='variant',
    )
Esempio n. 5
0
def run_pipeline(args):
    """Export the Hail table at ``args.input`` to the ``ssc_genomes`` index.

    Args:
        args: Object with an ``input`` attribute naming the Hail table to read.
    """
    # Hail is initialised first so its log goes to a fixed local file.
    hl.init(log='./hail_annotation_pipeline.log')
    export_ht_to_es(hl.read_table(args.input), index_name='ssc_genomes')
Esempio n. 6
0
def run_pipeline(args):
    """Export the Hail table at ``args.input`` to ``gnomad_structural_variants``.

    Args:
        args: Object with an ``input`` attribute naming the Hail table to read.
    """
    # Hail is initialised first so its log goes to a fixed local file.
    hl.init(log='./hail_annotation_pipeline.log')
    export_ht_to_es(
        hl.read_table(args.input),
        index_name='gnomad_structural_variants',
    )
Esempio n. 7
0
def populate_constraint():
    """Import the gnomAD constraint table and export it.

    Reads ``constraint_final_cleaned.txt.bgz`` (tab-delimited, keyed by
    ``transcript``, imputed column types), drops all globals, adds
    ``gene_name`` / ``transcript_id`` from ``gene`` / ``transcript``, prints
    the schema and a preview, and passes the table to ``export_ht_to_es``
    with ``index_name='gnomad_constraint_2_1_1'``.

    ``describe()`` and ``show()`` are called directly. They print their own
    output and return ``None``, so wrapping them in ``pprint.pprint`` only
    added a stray ``None`` line after each.
    """
    # Alternative inputs that were tried previously:
    #   hl.read_table('gnomad.v2.1.1.lof_metrics.by_transcript.ht')
    #   hl.import_table('constraint_final_standard.txt.bgz', delimiter='\t', key='transcript', impute=True)
    #   hl.import_table('missing_small.txt', delimiter='\t', key='transcript', impute=True)
    ds = hl.import_table(
        'constraint_final_cleaned.txt.bgz',
        delimiter='\t',
        key='transcript',
        impute=True)

    # The globals in the Hail table cause a serialization error during Elasticsearch export
    ds = ds.select_globals()
    ds.describe()

    # --- Disabled conversions -------------------------------------------------
    # NOTE(review): presumably only needed when reading the gnomAD Hail table
    # rather than the flat text file -- confirm before re-enabling.
    #
    # population_dict_fields = [
    #     "pop_no_lofs",
    #     "pop_obs_het_lof",
    #     "pop_obs_hom_lof",
    #     "pop_defined",
    #     "pop_p",
    # ]
    #
    # populations = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
    #
    # # Convert dicts to structs for Elasticsearch export
    # ds = ds.annotate(
    #     **{
    #         f: hl.struct(**{pop: ds[f][pop] for pop in populations})
    #         for f in population_dict_fields
    #     }
    # )
    #
    # # Convert interval to struct for Elasticsearch export
    # ds = ds.annotate(
    #     interval=hl.struct(
    #         chrom=ds.interval.start.contig,
    #         start=ds.interval.start.position,
    #         end=ds.interval.end.position,
    #     )
    # )
    #
    # ds = ds.key_by()
    # --------------------------------------------------------------------------

    ds = ds.transmute(gene_name=ds.gene, transcript_id=ds.transcript)

    #ds.write(args.output_url)

    # --- Disabled column subsets ----------------------------------------------
    # ds = ds.select('exp_lof','exp_mis','exp_syn','obs_lof','obs_mis','obs_syn',
    #                 'oe_lof','oe_lof_lower','oe_mis','oe_mis_lower','oe_mis_upper',
    #                 'oe_syn','oe_syn_lower','oe_syn_upper',
    #                 'lof_z','mis_z','syn_z',
    #                 'pLI','pNull','pRec')
    #
    # ds = ds.select('exp_lof','exp_mis','exp_syn','obs_lof','obs_mis','obs_syn',
    #                 'lof_z','mis_z','syn_z',
    #                 'pLI','pNull','pRec')
    # --------------------------------------------------------------------------

    # Print the final schema and a preview of the rows before exporting.
    ds.describe()
    ds.show()

    export_ht_to_es(ds, index_name = 'gnomad_constraint_2_1_1',index_type = 'constraint')