Example #1
    def _create(self, resource_dir):
        tsv = 'random_doubles_mt.tsv.bgz'
        logging.info(f"downloading {tsv}")
        download(resource_dir, tsv)
        local_tsv = os.path.join(resource_dir, tsv)
        hl.import_matrix_table(local_tsv, row_key="row_idx", row_fields={"row_idx": hl.tint32}, entry_type=hl.tfloat64) \
            .write(os.path.join(resource_dir, "random_doubles_mt.mt"))
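A minimal sketch of reading the written matrix table back, assuming the same resource_dir and an initialized Hail session:

import os
import hail as hl

mt = hl.read_matrix_table(os.path.join(resource_dir, "random_doubles_mt.mt"))
mt.describe()  # rows keyed by row_idx (tint32); entries are tfloat64 in the default field 'x'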
Example #2
    def test_import_matrix_table(self):
        mt = hl.import_matrix_table(doctest_resource('matrix1.tsv'),
                                    row_fields={'Barcode': hl.tstr, 'Tissue': hl.tstr, 'Days': hl.tfloat32})
        self.assertEqual(mt['Barcode']._indices, mt._row_indices)
        self.assertEqual(mt['Tissue']._indices, mt._row_indices)
        self.assertEqual(mt['Days']._indices, mt._row_indices)
        self.assertEqual(mt['col_id']._indices, mt._col_indices)
        self.assertEqual(mt['row_id']._indices, mt._row_indices)

        mt.count()

        row_fields = {'f0': hl.tstr, 'f1': hl.tstr, 'f2': hl.tfloat32}
        hl.import_matrix_table(doctest_resource('matrix2.tsv'),
                               row_fields=row_fields, row_key=[]).count()
        hl.import_matrix_table(doctest_resource('matrix3.tsv'),
                               row_fields=row_fields,
                               no_header=True).count()
        hl.import_matrix_table(doctest_resource('matrix3.tsv'),
                               row_fields=row_fields,
                               no_header=True,
                               row_key=[]).count()
        self.assertRaises(hl.utils.FatalError,
                          hl.import_matrix_table,
                          doctest_resource('matrix3.tsv'),
                          row_fields=row_fields,
                          no_header=True,
                          row_key=['foo'])
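The assertions above exercise import_matrix_table's defaults: with no row_key a generated row_id field keys the rows, columns are keyed by col_id, and entries land in a field named x. A self-contained sketch of those defaults, using a hypothetical two-sample TSV:

import hail as hl

with open('/tmp/mini_matrix.tsv', 'w') as f:
    f.write('Barcode\ts1\ts2\n')   # one declared row field, two column ids
    f.write('AAAC\t1.0\t2.0\n')
    f.write('GGTT\t3.0\t4.0\n')

mini = hl.import_matrix_table('/tmp/mini_matrix.tsv',
                              row_fields={'Barcode': hl.tstr},
                              entry_type=hl.tfloat64)
mini.show()  # rows keyed by row_id, columns by col_id ('s1', 's2'), entries in 'x'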
Example #4
def populate_gtex():
    meta_ht = hl.import_table(
        '/home/ml2529/gtex_data/GTEx_v7_Annotations_SampleAttributesDS.txt',
        delimiter='\t',
        key='SAMPID')
    mt = hl.import_matrix_table('/home/ml2529/gtex_data/ENSG00000177732.tsv',
                                row_key='transcript_id',
                                row_fields={
                                    'transcript_id': hl.tstr,
                                    'gene_id': hl.tstr
                                },
                                entry_type=hl.tfloat32)
    #mt = hl.import_matrix_table('/home/ml2529/gtex_data/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.bgz', row_key='transcript_id', row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},entry_type=hl.tfloat32)

    mt = mt.rename({'transcript_id': 'transcriptId', 'gene_id': 'geneId'})

    #pprint.pprint(meta_ht.describe())
    #pprint.pprint(gtex_mt.describe())

    mt = mt.annotate_cols(tissue=meta_ht[mt.col_id].SMTSD)

    #pprint.pprint(mt.describe())
    #pprint.pprint(mt.show(include_row_fields=True))

    cut_dict = {
        'tissue':
        hl.agg.filter(hl.is_defined(mt.tissue), hl.agg.counter(mt.tissue))
    }
    #pprint.pprint(cut_dict)

    cut_data = mt.aggregate_cols(hl.struct(**cut_dict))
    #pprint.pprint(cut_data.tissue)

    #call_stats = hl.agg.filter(mt.tissue == 'Lung', hl.agg.mean(mt.x))
    #pprint.pprint(call_stats)

    #mt = mt.annotate_rows(Lung=call_stats)
    #pprint.pprint(mt.show(include_row_fields=True))

    # add one row field of mean expression per tissue (abbreviated name)
    for x in sorted(cut_data['tissue'].keys()):
        #pprint.pprint(x)
        call_stats = hl.agg.filter(mt.tissue == x, hl.agg.mean(mt.x))
        mt = mt.transmute_rows(**{tissue_abbr[x]: call_stats})

    #pprint.pprint(mt.show(include_row_fields=True))

    ht = mt.rows()

    #ht.write('gtex_expression.ht',overwrite=True)

    export_ht_to_es(ht,
                    index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
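The per-tissue loop above builds one row field per tissue. A sketch of a shorter alternative on the same mt, using group_cols_by as the later examples do (assuming the entry field x holds the expression values):

mt_by_tissue = mt.group_cols_by(mt.tissue).aggregate(mean_tpm=hl.agg.mean(mt.x))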
Example #5
def prepare_gtex_expression_data(transcript_tpms_path, sample_annotations_path,
                                 tmp_path):
    # Recompress tpms file with block gzip so that import_matrix_table will read the file
    ds = hl.import_table(transcript_tpms_path, force=True)
    tmp_transcript_tpms_path = tmp_path + "/" + transcript_tpms_path.split(
        "/")[-1].replace(".gz", ".bgz")
    ds.export(tmp_transcript_tpms_path)

    # Import data
    ds = hl.import_matrix_table(
        tmp_transcript_tpms_path,
        row_fields={
            "transcript_id": hl.tstr,
            "gene_id": hl.tstr
        },
        entry_type=hl.tfloat,
    )
    ds = ds.rename({"col_id": "sample_id"})
    ds = ds.repartition(1000, shuffle=True)

    samples = hl.import_table(sample_annotations_path, key="SAMPID")

    # Separate version numbers from transcript and gene IDs
    ds = ds.annotate_rows(
        transcript_id=ds.transcript_id.split(r"\.")[0],
        transcript_version=hl.int(ds.transcript_id.split(r"\.")[1]),
        gene_id=ds.gene_id.split(r"\.")[0],
        gene_version=hl.int(ds.gene_id.split(r"\.")[1]),
    )

    # Annotate columns with the tissue the sample came from
    ds = ds.annotate_cols(tissue=samples[ds.sample_id].SMTSD)

    # Collect expression into median across all samples in each tissue
    ds = ds.group_cols_by(ds.tissue).aggregate(**{
        "": hl.agg.approx_median(ds.x)
    }).make_table()

    # Format tissue names
    other_fields = {
        "transcript_id", "transcript_version", "gene_id", "gene_version"
    }
    tissues = [f for f in ds.row_value.dtype.fields if f not in other_fields]
    ds = ds.transmute(tissues=hl.struct(
        **{format_tissue_name(tissue): ds[tissue]
           for tissue in tissues}))

    ds = ds.key_by("transcript_id").drop("row_id")

    return ds
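A hedged usage sketch for the function above; the bucket paths are hypothetical placeholders:

ds = prepare_gtex_expression_data(
    transcript_tpms_path="gs://my-bucket/GTEx_transcript_tpms.txt.gz",
    sample_annotations_path="gs://my-bucket/GTEx_sample_annotations.txt",
    tmp_path="gs://my-bucket/tmp")
ds.write("gs://my-bucket/gtex_transcript_expression.ht", overwrite=True)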
Example #6
def get_gtex_summary(gtex_rsem_path,
                     gtex_tx_summary_out_path,
                     get_medians=True):
    """
    Get GTEx RSEM table with ENSTs and ENSGs as rows and GTEx samples as columns (e.g. Muscle-Skeletal.12,
    Adipose.27 etc.) and write out a table with same rows, and tissues as columns (Muscle-Skeletal, Adipose etc.)
    with cells representing summary expression of transcripts across tissues (ie. mean or median).

    :param str gtex_rsem_path: Output of RSEM quantifications from GTEx
    Example: "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz"
    :param str gtex_tx_summary_out_path: Path to write out.
    Example: "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt"
    :param bool get_medians: Default True. If False, returns mean transcript expression per tissue
    :return: Writes out summarized GTEx transcript expression as Table.
    :rtype: None
    """

    gtex = hl.import_matrix_table(gtex_rsem_path,
                                  row_key='transcript_id',
                                  row_fields={
                                      'transcript_id': hl.tstr,
                                      'gene_id': hl.tstr
                                  },
                                  entry_type=hl.tfloat64)

    gtex = gtex.annotate_cols(tissue=gtex.col_id.split("\\.")[0])

    if get_medians:
        gtex = gtex.group_cols_by(gtex.tissue).aggregate(
            median_tx_expr=hl.median(agg.collect(gtex.x)))
    else:
        gtex = gtex.group_cols_by(
            gtex.tissue).aggregate(mean_tx_expr=hl.mean(agg.collect(gtex.x)))

    # Collect the per-tissue summary values into one array per transcript
    expr_field = gtex.median_tx_expr if get_medians else gtex.mean_tx_expr
    gtex = gtex.annotate_rows(agg_expression=agg.collect(expr_field))

    # Modify the gtex table to remove version numbers
    gtex = gtex.annotate_rows(transcript_id=gtex.transcript_id.split("\\.")[0])
    gtex = gtex.annotate_rows(gene_id=gtex.gene_id.split("\\.")[0])

    gtex.write(gtex_tx_summary_out_path, overwrite=True)
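A usage sketch, reusing the example paths given in the docstring above:

get_gtex_summary(
    "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz",
    "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt",
    get_medians=True)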
Example #7
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # Each entry packs 'chi_squared,n' as a comma-separated string, so import
    # entries as strings here and parse them into typed fields below.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Example #8
import hail as hl

root = 'gs://hail-datasets-raw-data/LDSC/baselineLD_v2.2'

mt = hl.import_matrix_table(f'{root}/ld_scores.GRCh37.tsv.bgz',
    row_fields={'CHR': hl.tstr, 'SNP': hl.tstr, 'BP': hl.tint}, entry_type=hl.tstr)

mt = mt.annotate_entries(x=hl.float(mt['x']))
mt = mt.annotate_rows(
    locus=hl.locus(mt['CHR'], mt['BP'], 'GRCh37'))
mt = mt.key_rows_by('locus')
mt = mt.select_rows('SNP')

M = hl.import_table(
    f'{root}/M.GRCh37.tsv.bgz', key='annotation')
M_5_50 = hl.import_table(
    f'{root}/M_5_50.GRCh37.tsv.bgz', key='annotation')

mt = mt.rename({'col_id': 'annotation'})
mt = mt.annotate_cols(
    M_5_50=hl.int(hl.float(M_5_50[mt.annotation].M_5_50)),
    M=hl.int(hl.float(M[mt.annotation].M)))

n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name='LDSC_baselineLD_v2.2_ld_scores',
        reference_genome='GRCh37',
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions))
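A script of this shape typically finishes by writing the annotated MatrixTable; a sketch with a hypothetical output path:

mt.write('gs://hail-datasets/LDSC_baselineLD_v2.2_ld_scores.GRCh37.mt', overwrite=True)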
Example #9
def main(args):
    input_tsv = args.input_tsv
    output_ht = args.output_ht
    chunk_size = args.chunk_size
    overwrite = args.overwrite

    mt_list = []
    logger.info(
        "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..."
    )
    with open(input_tsv, "r") as f:
        #next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, base_level_coverage_metrics = items[0:2]
            #print(sample)
            #print(base_level_coverage_metrics)

            mt = hl.import_matrix_table(
                base_level_coverage_metrics,
                delimiter="\t",
                row_fields={
                    "chrom": hl.tstr,
                    "pos": hl.tint,
                    "target": hl.tstr
                },
                row_key=["chrom", "pos"],
            ).drop("target")
            mt = mt.rename({"x": "coverage"})
            mt = mt.key_cols_by(s=sample)
            mt_list.append(mt)

    logger.info("Joining individual coverage mts...")
    out_dir = dirname(output_ht)
    temp_out_dir = out_dir + "/temp"

    cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    n_samples = cov_mt.count_cols()

    logger.info("Adding coverage annotations...")
    cov_mt = cov_mt.annotate_rows(
        locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"),
        mean=hl.float(hl.agg.mean(cov_mt.coverage)),
        median=hl.median(hl.agg.collect(cov_mt.coverage)),
        over_100=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 100) / n_samples)),
        over_1000=hl.float(
            (hl.agg.count_where(cov_mt.coverage > 1000) / n_samples)),
    )
    cov_mt.show()

    cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos")

    output_mt = re.sub(r"\.ht$", ".mt", output_ht)
    output_tsv = re.sub(r"\.ht$", ".tsv", output_ht)
    output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht)

    logger.info("Writing sample level coverage...")
    sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position)
    sample_mt.coverage.export(output_samples)

    logger.info("Writing coverage mt and ht...")
    cov_mt.write(output_mt, overwrite=overwrite)
    cov_ht = cov_mt.rows()
    cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite)
    cov_ht.export(output_tsv)
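A minimal argparse wiring sketch for main() above; the flag names are assumptions inferred from the attributes read off args:

import argparse

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--input-tsv", dest="input_tsv", required=True)
    p.add_argument("--output-ht", dest="output_ht", required=True)
    p.add_argument("--chunk-size", dest="chunk_size", type=int, default=100)
    p.add_argument("--overwrite", action="store_true")
    main(p.parse_args())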
Example #10
# Load in the dosage files from Tractor
# First implementing the GWAS version that includes the full VCF but with dosage calls per ancestry

row_fields = {
    'CHROM': hl.tstr,
    'POS': hl.tint,
    'ID': hl.tstr,
    'REF': hl.tstr,
    'ALT': hl.tstr
}
anc0dos = hl.import_matrix_table(
    'gs://ukb-diverse-pops/AdmixedAfrEur/DosageFiles/UKBB_AfEur_QCed_lipids.autosomes.anc0.dosage_v1.txt.gz',
    force_bgz=True,
    row_fields=row_fields,
    row_key=[],
    min_partitions=32)
anc0dos = anc0dos.key_rows_by().drop('row_id')
anc0dos = anc0dos.key_rows_by(locus=hl.locus(anc0dos.CHROM, anc0dos.POS))


row_fields = {
    'CHROM': hl.tstr,
    'POS': hl.tint,
    'ID': hl.tstr,
    'REF': hl.tstr,
    'ALT': hl.tstr
}
anc1dos = hl.import_matrix_table(
    'gs://ukb-diverse-pops/AdmixedAfrEur/DosageFiles/UKBB_AfEur_QCed_lipids.autosomes.anc1.dosage_v1.txt.gz',
    force_bgz=True,
    row_fields=row_fields,
    row_key=[],
    min_partitions=32)
anc1dos = anc1dos.key_rows_by().drop('row_id')
anc1dos = anc1dos.key_rows_by(locus=hl.locus(anc1dos.CHROM, anc1dos.POS))
Example #11
    ht_genes = import_gtf(path=EXTRACT_BUCKET +
                          'GTEx/v7/GTEx_genes.v7.GRCh37.gtf.bgz',
                          reference_genome='GRCh37')
    ht_genes = ht_genes.filter(ht_genes['feature'] == 'gene')
    ht_genes = ht_genes.key_by(ht_genes['gene_id'])
    ht_genes = ht_genes.select('interval', 'strand', 'gene_name',
                               'havana_gene', 'gene_type', 'gene_status',
                               'level', 'tag')
    ht_genes = ht_genes.rename({'interval': 'gene_interval'})
    ht_genes = ht_genes.distinct()

    mt_counts = hl.import_matrix_table(
        EXTRACT_BUCKET + 'GTEx/v7/GTEx_gene_read_counts.v7.GRCh37.tsv.bgz',
        row_fields={
            'Name': hl.tstr,
            'Description': hl.tstr
        },
        row_key='Name',
        missing=' ',
        entry_type=hl.tfloat)
    mt_counts = mt_counts.drop('Description')
    mt_counts = mt_counts.transmute_entries(read_count=hl.int(mt_counts['x']))
    mt_counts = mt_counts.rename({'col_id': 'sample_id', 'Name': 'gene_id'})

    mt_tpm = hl.import_matrix_table(EXTRACT_BUCKET +
                                    'GTEx/v7/GTEx_gene_tpm.v7.GRCh37.tsv.bgz',
                                    row_fields={
                                        'Name': hl.tstr,
                                        'Description': hl.tstr
                                    },
                                    row_key='Name',
                                    missing=' ',
                                    entry_type=hl.tfloat)
Example #12
    # s3 credentials required for the user to access the datasets in the farm flexible compute s3 environment
    # you may use your own here from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels_updated.tsv",
        delimiter="\t").key_by('s')

    # s3a://DDD-ELGH-UKBB-exomes/ancestry/WES_AKT_1kg_intersection.vcf.mt
    # # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/WES_AKT_1kg_intersection.mt")
    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt"
    )
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{temp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
Example #13
def main(args):

    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')

    # # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)
    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:
    # overlap_1kg_AKT
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    #pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # remove pruned areas that need to be removed

    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)
    # pruned_mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")

    # related_samples_to_drop = hl.read_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #    pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(
    #    loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt",
              'w') as f:
        for val in pca_evals:
            f.write(f"{val}\n")

    logger.info("assign population pcs")

    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")
Example #14
ht_transcripts = ht_transcripts.rename({'f0': 'contig',
                                        'f1': 'annotation_source',
                                        'f2': 'feature_type',
                                        'f3': 'start',
                                        'f4': 'end',
                                        'f5': 'score',
                                        'f6': 'strand',
                                        'f7': 'phase',
                                        'f8': 'attributes'})

ht_transcripts = ht_transcripts.filter(ht_transcripts.feature_type == 'transcript')
ht_transcripts = ht_transcripts.annotate(
    interval=hl.interval(hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'),
                         hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
ht_transcripts = ht_transcripts.annotate(
    attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0],
                                         x.split(' ')[1].replace('"', '').replace(';$', '')),
                              ht_transcripts.attributes.split('; '))))
attribute_cols = list(ht_transcripts.aggregate(
    hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
ht_transcripts = ht_transcripts.annotate(
    **{x: hl.or_missing(ht_transcripts.attributes.contains(x), ht_transcripts.attributes[x])
       for x in attribute_cols})
ht_transcripts = ht_transcripts.select(
    'transcript_id', 'transcript_name', 'transcript_type', 'strand',
    'transcript_status', 'havana_transcript', 'ccdsid', 'ont', 'gene_name',
    'interval', 'gene_type', 'annotation_source', 'havana_gene', 'gene_status', 'tag')
ht_transcripts = ht_transcripts.rename({'havana_transcript': 'havana_transcript_id',
                                        'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

mt = hl.import_matrix_table('gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
                            row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr}, row_key='transcript_id', missing='', entry_type=hl.tfloat)

mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)

mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)

mt = mt.annotate_cols(**ht_samples[mt.sample_id])
mt = mt.annotate_rows(**ht_transcripts[mt.transcript_id])

mt.describe()
mt.write('gs://hail-datasets/hail-data/gtex_v7_transcript_read_counts.GRCh37.mt', overwrite=True)
Example #15
                           'havana_gene', 'gene_type', 'gene_status', 'level',
                           'score', 'strand', 'frame', 'tag')
ht_genes = ht_genes.rename({
    'gene_name': 'gene_symbol',
    'havana_gene': 'havana_gene_id'
})
ht_genes.write('hdfs:///tmp/genes.ht', overwrite=True)
ht_genes = hl.read_table('hdfs:///tmp/genes.ht')

# gene read counts
name = 'GTEx_RNA_seq_gene_read_counts'
mt = hl.import_matrix_table(
    f'{raw_data_root}/GTEx_v7_RNA_seq_gene_read_counts.tsv.bgz',
    row_fields={
        'Name': hl.tstr,
        'Description': hl.tstr
    },
    row_key='Name',
    entry_type=hl.tstr,
    missing=' ')
mt = mt.select_entries(read_count=hl.int(hl.float(mt.x)))
mt = mt.rename({
    'Name': 'gene_id',
    'Description': 'gene_symbol',
    'col_id': 's'
})
mt = mt.annotate_cols(subject_id=hl.delimit(mt['s'].split('-')[:2], '-'))
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = mt.annotate_cols(**ht_subjects[mt.subject_id])
mt = mt.annotate_rows(**ht_genes[mt.gene_id])
Example #16
hl.init()


#load in plotting features
from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()


# Load in the dosage files from Tractor
# Note: this will be the most time-intensive step. The Hail team is actively optimizing pieces of this infrastructure.
# The user should modify the paths in the import steps to match the location of their datasets (shown here for files on Google Cloud).

# start loading in the ancestry 0 minor allele dosages
row_fields = {'CHROM': hl.tstr, 'POS': hl.tint, 'ID': hl.tstr, 'REF': hl.tstr, 'ALT': hl.tstr}
anc0dos = hl.import_matrix_table('gs://.../Dataset.anc0.dosage.txt.gz', force_bgz=True, row_fields=row_fields, row_key=[], min_partitions=32)
anc0dos = anc0dos.key_rows_by().drop('row_id')
anc0dos = anc0dos.key_rows_by(locus=hl.locus(anc0dos.CHROM, anc0dos.POS))


# also load ancestry 1 allele dosages
row_fields = {'CHROM': hl.tstr, 'POS': hl.tint, 'ID': hl.tstr, 'REF': hl.tstr, 'ALT': hl.tstr}
anc1dos = hl.import_matrix_table('gs://.../Dataset.anc1.dosage.txt.gz', force_bgz=True, row_fields=row_fields, row_key=[], min_partitions=32)
anc1dos = anc1dos.key_rows_by().drop('row_id')
anc1dos = anc1dos.key_rows_by(locus=hl.locus(anc1dos.CHROM, anc1dos.POS))

# Optional - save these temporary files to relieve memory burden
anc0dos = anc0dos.checkpoint('gs://.../Dataset.anc0.dosage.mt')
anc1dos = anc1dos.checkpoint('gs://.../Dataset.anc1.dosage.mt')