Example 1
 def test_multi_write(self):
     mt = self.get_vds()
     f = new_temp_file()
     hl.experimental.write_matrix_tables([mt, mt], f)
     path1 = f + '0.mt'
     path2 = f + '1.mt'
     mt1 = hl.read_matrix_table(path1)
     mt2 = hl.read_matrix_table(path2)
     self.assertTrue(mt._same(mt1))
     self.assertTrue(mt._same(mt2))
     self.assertTrue(mt1._same(mt2))
Example 2
    def test_backward_compatability(self):
        import os

        all_values_table, all_values_matrix_table = create_all_values_datasets()

        table_dir = resource('backward_compatability/1.0.0/table')
        matrix_table_dir = resource('backward_compatability/1.0.0/matrix_table')

        n = 0
        i = 0
        f = os.path.join(table_dir, '{}.ht'.format(i))
        while os.path.exists(f):
            ds = hl.read_table(f)
            self.assertTrue(ds._same(all_values_table))
            i += 1
            f = os.path.join(table_dir, '{}.ht'.format(i))
            n += 1

        i = 0
        f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
        while os.path.exists(f):
            ds = hl.read_matrix_table(f)
            self.assertTrue(ds._same(all_values_matrix_table))
            i += 1
            f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
            n += 1

        self.assertEqual(n, 8)
Example 3
    def test_write_stage_locally(self):
        mt = self.get_vds()
        f = new_temp_file(suffix='mt')
        mt.write(f, stage_locally=True)

        mt2 = hl.read_matrix_table(f)
        self.assertTrue(mt._same(mt2))
Example 4
 def test_codecs_matrix(self):
     from hail.utils.java import scala_object
     codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
     ds = self.get_vds()
     temp = new_temp_file(suffix='hmt')
     for codec in codecs:
         ds.write(temp, overwrite=True, _codec_spec=codec.toString())
         ds2 = hl.read_matrix_table(temp)
         self.assertTrue(ds._same(ds2))
Example 5
 def test_fix3307_read_mt_wrong(self):
     mt = hl.import_vcf(resource('sample2.vcf'))
     mt = hl.split_multi_hts(mt)
     mt.write('/tmp/foo.mt', overwrite=True)
     mt2 = hl.read_matrix_table('/tmp/foo.mt')
     t = hl.read_table('/tmp/foo.mt/rows')
     self.assertTrue(mt.rows()._same(t))
     self.assertTrue(mt2.rows()._same(t))
     self.assertTrue(mt._same(mt2))
Example 6
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    import gc
    # make the temp path a directory, no matter what
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    vcfs = [comb.transform_one(vcf)
            for vcf in hl.import_vcfs(sample_list, json, array_elements_required=False)]
    combined = [comb.combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINER_LENGTH)]
    if len(combined) == 1:
        combined[0].write(out_path, overwrite=overwrite)
    else:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        i = 0
        while len(combined) > 1:
            pad = len(str(len(combined)))
            hl.experimental.write_matrix_tables(combined, tmp_path + f'{i}/', overwrite=True)
            paths = [tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt' for n in range(len(combined))]
            i += 1
            wmts = [hl.read_matrix_table(path) for path in paths]
            combined = [comb.combine_gvcfs(mts) for mts in chunks(wmts, MAX_COMBINER_LENGTH)]
            gc.collect()  # need to try to free memory on the master
        combined[0].write(out_path, overwrite=overwrite)
    if summary_path is not None:
        mt = hl.read_matrix_table(out_path)
        comb.summarize(mt).rows().write(summary_path, overwrite=overwrite)
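
Note: the chunks helper used above is not defined in this snippet. A minimal
sketch of such a helper, assuming it simply splits a list into consecutive
slices of at most the given length (a hypothetical stand-in, not the project's
actual implementation):

def chunks(seq, size):
    # yield consecutive slices of seq, each of length at most size
    for start in range(0, len(seq), size):
        yield seq[start:start + size]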
Example 7
    def test_head(self):
        # no empty partitions
        mt1 = hl.utils.range_matrix_table(10, 10)

        # empty partitions at front
        mt2 = hl.utils.range_matrix_table(20, 10, 20)
        mt2 = mt2.filter_rows(mt2.row_idx > 9)
        mts = [mt1, mt2]

        for mt in mts:
            tmp_file = new_temp_file(suffix='mt')

            mt.write(tmp_file)
            mt_readback = hl.read_matrix_table(tmp_file)
            for mt_ in [mt, mt_readback]:
                assert mt_.head(1).count_rows() == 1
                assert mt_.head(1)._force_count_rows() == 1
                assert mt_.head(100).count_rows() == 10
                assert mt_.head(100)._force_count_rows() == 10
Example 8
File: utils.py Project: jigold/hail
def download_data():
    global _initialized, _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), ['profile.vcf.bgz', 'profile.mt'])
    if not all(os.path.exists(file) for file in files):
        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
                    vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf).write(os.path.join(_data_dir, 'profile.mt'))
        print('done', flush=True)
    else:
        print('all files found.', flush=True)

    _initialized = True
    _mt = hl.read_matrix_table(resource('profile.mt'))
Example 9
def split_multi_hts():
    mt = hl.read_matrix_table(resource('profile.mt'))
    hl.split_multi_hts(mt)._force_count_rows()
Example 10
    def test_ld_score_regression(self):

        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

        ht_50_irnt = hl.import_table(
            doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_50_irnt = ht_50_irnt.annotate(
            chi_squared=ht_50_irnt['Z']**2,
            n=ht_50_irnt['N'],
            ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                           ht_scores[ht_50_irnt['SNP']]['BP']),
            alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
            phenotype='50_irnt')

        ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'],
                                       ht_50_irnt['alleles'])

        ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                       ht_50_irnt['n'],
                                       ht_50_irnt['ld_score'],
                                       ht_50_irnt['phenotype'])

        ht_20160 = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_20160 = ht_20160.annotate(
            chi_squared=ht_20160['Z']**2,
            n=ht_20160['N'],
            ld_score=ht_scores[ht_20160['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                           ht_scores[ht_20160['SNP']]['BP']),
            alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
            phenotype='20160')

        ht_20160 = ht_20160.key_by(ht_20160['locus'],
                                   ht_20160['alleles'])

        ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                                   ht_20160['n'],
                                   ht_20160['ld_score'],
                                   ht_20160['phenotype'])

        ht = ht_50_irnt.union(ht_20160)
        mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                                col_key=['phenotype'],
                                row_fields=['ld_score'],
                                col_fields=[])

        mt_tmp = new_temp_file()
        mt.write(mt_tmp, overwrite=True)
        mt = hl.read_matrix_table(mt_tmp)

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=mt['ld_score'],
            ld_score_expr=mt['ld_score'],
            chi_sq_exprs=mt['chi_squared'],
            n_samples_exprs=mt['n'],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results['50_irnt']['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results['20160']['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_standard_error'],
            0.0416, places=4)

        ht = ht_50_irnt.annotate(
            chi_squared_50_irnt=ht_50_irnt['chi_squared'],
            n_50_irnt=ht_50_irnt['n'],
            chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
            n_20160=ht_20160[ht_50_irnt.key]['n'])

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=ht['ld_score'],
            ld_score_expr=ht['ld_score'],
            chi_sq_exprs=[ht['chi_squared_50_irnt'],
                               ht['chi_squared_20160']],
            n_samples_exprs=[ht['n_50_irnt'],
                             ht['n_20160']],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results[0]['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results[1]['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_standard_error'],
            0.0416, places=4)
Example 11
def matrix_table_rows_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.rows().show(100)
Example 12
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)
Example 13
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.
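
    As a purely illustrative calculation (hypothetical numbers, not drawn from
    any study): with :math:`N = 10000` samples, :math:`M = 10^6` variants,
    :math:`h_g^2 = 0.5`, and no confounding (:math:`a = 0`), a variant with
    LD score :math:`l_j = 100` has an expected chi-squared statistic of
    :math:`1 + (10000 \cdot 0.5 / 10^6) \cdot 100 = 1.5`.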

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.
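
    For example (a sketch, assuming the ``ht_results`` table from the examples
    above), these fields can be read from the collected rows:

    >>> first = ht_results.collect()[0]  # doctest: +SKIP
    >>> first['intercept']['estimate'], first['snp_heritability']['estimate']  # doctest: +SKIP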

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')
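
    # assign each defined entry to one of n_blocks contiguous jackknife blocks,
    # based on its rank among the step-1 variants for that phenotype; step-2
    # blocks reuse the same boundaries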

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])
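
    # delete-one-block jackknife pseudo-values: n_blocks * full-data estimate
    # minus (n_blocks - 1) * the leave-one-block-out estimate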

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht
Example 14
def matrix_table_entries_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.entries().show()


def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)

    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path('pc_relate_global_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for pc_relate NFE samples
    ht = hl.read_table(PC_RELATE_ESTIMATE_NFE)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_nfe = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_nfe_matrix.csv', 'analysis')
    pc_relate_nfe.to_csv(filename, index=False)
    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path(f'pc_relate_nfe_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for KING NFE samples
    mt = hl.read_matrix_table(KING_ESTIMATE_NFE)
    ht = mt.entries()
    # remove entries where the two samples are identical and keep pairs with phi > 0.1
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.1))
    king_nfe = pd.DataFrame({
        'i_s': related_samples.s_1.collect(),
        'j_s': related_samples.s.collect(),
        'kin': related_samples.phi.collect(),
    })
    filename = output_path('king_nfe_matrix_90k.csv', 'analysis')
    king_nfe.to_csv(filename, index=False)
    # save KING NFE maximal independent set
    second_degree_related_samples = ht.filter(
        (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(i=second_degree_related_samples.s_1,
                       j=second_degree_related_samples.s)
    struct = struct.annotate(phi=second_degree_related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i,
        struct.j,
        False  # pylint: disable=E1101
    )
    related_samples = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()})
    filename = output_path(
        'king_90k_related_samples_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)
Example 16
ht_aj_samples = hl.import_table(AJ_LIST, no_header=True, key='f0')
ht_pca_samples = hl.import_table(PCA_LIST, no_header=True, key='f0')
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')

ht_initial_variants = hl.import_table(INITIAL_VARIANT_LIST,
                                      types={
                                          'locus':
                                          hl.tlocus(reference_genome='GRCh38'),
                                          'alleles':
                                          hl.tarray(hl.tstr)
                                      })

ht_initial_variants = ht_initial_variants.key_by(ht_initial_variants.locus,
                                                 ht_initial_variants.alleles)

mt = hl.read_matrix_table(MT)

mt = mt.filter_cols(hl.is_defined(ht_pca_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_aj_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_ibd_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_initial_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])

mt = hl.variant_qc(mt, name='qc')

mt = mt.annotate_rows(qc=mt.qc.annotate(
    AC=mt.qc.AC[1], AF=mt.qc.AF[1],
    homozygote_count=mt.qc.homozygote_count[1]))
Example 17
def group_by_collect_per_row(path):
    ht = hl.read_matrix_table(path).localize_entries('e', 'c')
    ht.group_by(*ht.key).aggregate(value=hl.agg.collect(ht.row_value))._force_count()
Example 18
    # Add file handler
    fh = logging.FileHandler(args.log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    root.addHandler(fh)

    # Add streaming handler
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    root.addHandler(ch)

    ########################
    # Read in matrix table #
    ########################
    qcd_mt = hl.read_matrix_table(args.mt)

    ###########################################
    # Validate parent/offspring relationships #
    ###########################################
    validated_fam = validate_pedigree(args.fam, args.kin, args)

    ########################
    # Find denovo variants #
    ########################
    denovo_table = get_denovos(validated_fam, qcd_mt, args)

    denovo_table.write(args.output_stem + "_denovo_variants.ht",
                       overwrite=True)
    denovo_table = denovo_table.flatten()
    denovo_table = denovo_table.drop("vep.input")


if __name__ == "__main__":
    # need to create the spark cluster first before initialising hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = "file:///home/ubuntu/data/tmp"
    plot_dir = "/home/ubuntu/data/tmp"
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # s3 credentials are required for the user to access the datasets in the farm flexible compute s3 environment
    # you may use your own here from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    # read matrix table
    # mt = hl.read_matrix_table(
    #    f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts_split_multi.mt')
    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_split.mt'
    )
    samples_to_remove_filename = f"{temp_dir}/ddd-elgh-ukbb/filtering/samples_failed_QC.tsv"

    samples_to_remove = hl.import_table(samples_to_remove_filename).key_by('s')
    mt_filtered = mt.filter_cols(hl.is_defined(samples_to_remove[mt.s]),
                                 keep=False)
    mt_filtered.write(
        f'{tmp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-7and20_split_sampleqc_filtered.mt'
    )
Example 20
def write_profile_mt(mt_path):
    with TemporaryDirectory() as tmpdir:
        hl.read_matrix_table(mt_path).write(path.join(tmpdir, 'tmp.mt'))
Example 21
def matrix_table_take_col(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.s.take(100)
Example 22
def matrix_table_take_row(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.info.AF.take(100)
Example 23
def matrix_table_rows_force_count(mt_path):
    ht = hl.read_matrix_table(mt_path).rows().key_by()
    ht._force_count()
Example 24
def group_by_take_rekey(path):
    ht = hl.read_matrix_table(path).localize_entries('e', 'c')
    ht.group_by(k=hl.int(ht.row_idx / 50)).aggregate(value=hl.agg.take(ht.row_value, 1))._force_count()
Example 25
def matrix_table_entries_table(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.entries()._force_count()
Example 26
def main(args):
    n_partitions = 500

    # ANNOTATION TABLES:
    truth_data_ht = hl.read_table(args.truthset_table)
    trio_stats_table = hl.read_table(args.trio_stats_table)

    #inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
    allele_data_ht = hl.read_table(args.allele_data)
    allele_counts_ht = hl.read_table(args.allele_count)
    allele_counts_ht = allele_counts_ht.select(
        *['ac_qc_samples_raw', 'ac_qc_samples_adj'])
    inbreeding_ht = hl.read_table(args.inbreeding)
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')
    # mt = mt.select_entries(
    #    GT=hl.unphased_diploid_gt_index_call(mt.GT.n_alt_alleles()))
    # mt = mt.annotate_rows(InbreedingCoeff=hl.or_missing(
    #   ~hl.is_nan(mt.info.InbreedingCoeff), mt.info.InbreedingCoeff))
    ht = mt.rows()
    ht = ht.transmute(**ht.info)
    #ht = ht.select("FS", "MQ", "QD", "InbreedingCoeff", *INFO_FEATURES)

    trio_stats_ht = trio_stats_table.select(f"n_transmitted_{group}",
                                            f"ac_children_{group}")

    ht = ht.annotate(
        **inbreeding_ht[ht.key],
        **trio_stats_ht[ht.key],
        **truth_data_ht[ht.key],
        **allele_data_ht[ht.key].allele_data,
        **allele_counts_ht[ht.key],
    )
    # Filter to only variants found in high quality samples or controls with no LowQual filter
    # ht = ht.filter(
    #    (ht[f"ac_children_{group}"] > 0)
    # )  # TODO: change to AS_lowqual for v3.1 or leave as is to be more consistent with v3.0? I will need to add this annotation if so
    ht = ht.annotate(fail_hard_filters=(ht.QD < 2)
                     | (ht.FS > 60) | (ht.MQ < 30))
    ht = ht.annotate(ac_raw=ht.ac_qc_samples_raw)
    ht = ht.annotate(transmitted_singleton=(ht[f"n_transmitted_{group}"] == 1)
                     & (ht[f"ac_qc_samples_{group}"] == 2))

    # the following only selects the required RF fields but I commented it out because some of the fields excluded are needed later
    # ht = ht.select(
    #    "a_index",
    #    "was_split",
    #    *FEATURES,
    #    *TRUTH_DATA,
    #    **{
    #        "transmitted_singleton": (ht[f"n_transmitted_{group}"] == 1)
    #        & (ht[f"ac_qc_samples_{group}"] == 2),
    #        "fail_hard_filters": (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30),
    #    },
    #    ac_raw=ht.ac_qc_samples_raw

    # )

    ht = ht.repartition(n_partitions, shuffle=False)
    ht = ht.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_table_for_RF_all_cols.ht',
        overwrite=True)
    ht = median_impute_features(ht, {"variant_type": ht.variant_type})
    ht = ht.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_table_for_RF_by_variant_type_all_cols.ht',
        overwrite=True)
Example 27
from pprint import pprint
hl.plot.output_notebook()

MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
PCA_SCORES_EUR = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/11_pca_scores.strict_european.tsv'

PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'

IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

# These are the strictly defined European samples.
EUROPEAN_SAMPLES_STRICT = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/10_european.strict.sample_list'

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')
ht_eur_samples_strict = hl.import_table(EUROPEAN_SAMPLES_STRICT,
                                        no_header=True,
                                        key='f0')

ht_pruned_variants = ht_pruned_variants.annotate(
    **hl.parse_variant(ht_pruned_variants.f0, reference_genome='GRCh38'))
ht_pruned_variants = ht_pruned_variants.key_by(ht_pruned_variants.locus,
                                               ht_pruned_variants.alleles)

mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb.tsv",
        delimiter="\t").key_by('s')

    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt"
    )
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    # done the above on pca_RF jupyter notebook
    # mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt")
    #mt = hl.split_multi_hts(    mt, keep_star=False, left_aligned=False)
    mt.write(
        f"{tmp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_ld_pruned.mt",
        overwrite=True)
    # filter matrixtable
    logger.info("wrote mt ")
    # filter mt
    # mt = hl.read_matrix_table(
Example 29
var_metadata_path = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# path for Konrad's densified matrix table
dense_mt_path = 'gs://hgdp_tgp/output/tgp_hgdp.mt'

# reading in Alicia's sample metadata file (Note: this file uses the 'v3.1::' prefix as done in gnomAD)
sample_meta = hl.import_table(sample_metadata_path, impute=True)

# reading in Julia's sample metadata file
jul_meta = hl.read_table(jul_metadata_path)

# reading in variant qc information
var_meta = hl.read_table(var_metadata_path)

# reading in densified matrix table
dense_mt = hl.read_matrix_table(dense_mt_path)

# These bits below were written by Tim Poterba to help troubleshoot unflattening a ht with nested structure
# dict to hold struct names as well as nested field names
d = {}

# Getting just the row field names
row = sample_meta.row_value

# returns a dict with the struct names as keys and their inner field names as values
for name in row:

    def recur(dict_ref, split_name):
        if (len(split_name) == 1):
            dict_ref[split_name[0]] = row[name]
            return
Example 30
    # Define the hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = os.path.join(os.environ["HAIL_HOME"], "tmp")
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # s3 credentials are required for the user to access the datasets in the farm flexible compute s3 environment
    # you may use your own here from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    #####################################################################
    ###################### INPUT DATA  ##############################
    #####################################################################
    CHROMOSOME = "WGS"
    mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_annotated.mt")
    mt = mt.key_rows_by('locus').distinct_by_row(
    ).key_rows_by('locus', 'alleles')
    mt_split = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
    mt_split = mt_split.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_checkpoint.mt",  overwrite=True)
    print("Finished splitting and writing mt. ")
    mt = mt_split.annotate_rows(
        Variant_Type=hl.cond((hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
                             hl.cond(
            hl.is_insertion(
                mt_split.alleles[0], mt_split.alleles[1]),
            "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0],
                                   mt_split.alleles[1]), "INDEL",
                    "Other"))))
Example 31
import hail as hl

GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(expected.select_rows(), 'gs://hail-common/vep/vep/vep85-loftee-gcloud.json', csq=csq)
    vep_result_agrees = actual._same(expected)
    if vep_result_agrees:
        print('TEST PASSED')
    else:
        print('TEST FAILED')
    assert vep_result_agrees
        **add_variant_type(mt.alleles)))
    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case().when(
        hl.is_snp(mt.alleles[0], mt.alleles[1]),
        'snv').when(hl.is_insertion(mt.alleles[0], mt.alleles[1]),
                    'ins').when(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                                'del').default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt


# Read in the matrix table
mt = hl.read_matrix_table('aatd.mt')

# Left normalize and split alleles
split = generate_split_alleles(mt)
mts = hl.variant_qc(split)

# Hard-filtering germline short variants
mts = mts.filter_rows(mts.info.QD >= 2)
mts = mts.filter_rows(mts.info.FS <= 60)
mts = mts.filter_rows(mts.info.SOR <= 3)
mts = mts.filter_rows(mts.info.MQ >= 40)
mts = mts.filter_rows(mts.info.MQRankSum >= -12.5)
mts = mts.filter_rows(mts.info.ReadPosRankSum >= -8)

mts_mq_40 = mts.filter_rows(mts.info.MQ >= 40)
mts_mq_60 = mts.filter_rows(mts.info.MQ >= 60)
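# The chained row filters above follow GATK-style hard-filter thresholds; they could
# equivalently be written as a single expression (illustrative sketch, not part of the
# original script; hard_filter and mts_hard_filtered are hypothetical names):
hard_filter = ((mts.info.QD >= 2) & (mts.info.FS <= 60) & (mts.info.SOR <= 3) &
               (mts.info.MQ >= 40) & (mts.info.MQRankSum >= -12.5) &
               (mts.info.ReadPosRankSum >= -8))
mts_hard_filtered = mts.filter_rows(hard_filter)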
Esempio n. 33
0
def matrix_table_aggregate_entries():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt.aggregate_entries(hl.agg.stats(mt.GQ))
Esempio n. 34
0
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`"""

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets if dataset['name']==name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(repr(version), 
                                                                   repr(name),
                                                                   repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name']==name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome),
                                                                                    repr(name), 
                                                                                    '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets if all([dataset['name']==name,
                                                            dataset['version']==version,
                                                            dataset['reference_genome']==reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset
Esempio n. 35
0
def matrix_table_cols_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.cols().show(100)
Esempio n. 36
0
def matrix_table_entries_table():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt.entries()._force_count()
Esempio n. 37
0
def export_vcf():
    mt = hl.read_matrix_table(resource('profile.mt'))
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
Esempio n. 38
0
def matrix_table_rows_is_transition():
    ht = hl.read_matrix_table(resource('profile.mt')).rows().key_by()
    ht.select(is_snp = hl.is_snp(ht.alleles[0], ht.alleles[1]))._force_count()
Esempio n. 39
0
def matrix_table_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.show(100)
Esempio n. 40
0
def matrix_table_many_aggs_col_wise():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt = mt.annotate_cols(**many_aggs(mt))
    mt.cols()._force_count()
Esempio n. 41
0
def matrix_table_entries_table_no_key(mt_path):
    mt = hl.read_matrix_table(mt_path).key_rows_by().key_cols_by()
    mt.entries()._force_count()
Esempio n. 42
0
def read_decode_gnomad_coverage(mt_path):
    hl.read_matrix_table(mt_path)._force_count_rows()
Esempio n. 43
0
hl.init()

from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# The following was run using these jars and zips (to enable the new version of split_multi).
# gs://hail-common/builds/0.2/jars/hail-0.2-7a280b932abc959f7accf11124f3255038f48c95-Spark-2.4.0.jar
# gs://hail-common/builds/0.2/python/hail-0.2-7a280b932abc959f7accf11124f3255038f48c95.zip

RAW_MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/dalio_bipolar_w1_w2/Dalio_W1_W2_GRCh38_exomes.mt'
MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.mt'
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'

# Remove multiallelics with 100 or more alleles...
mt = hl.read_matrix_table(RAW_MT)

# Count before splitting multi-allelics.
n = mt.count()

pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])

# Read in the target intervals
TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'
# Import the interval lists for the target intervals.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS,
                                             reference_genome='GRCh38')
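# The imported target intervals would typically be used to restrict the call set to the
# capture regions; a minimal sketch of that step (assumed usage, not shown in this snippet):
mt = mt.filter_rows(hl.is_defined(target_intervals[mt.locus]))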
Esempio n. 44
0
def matrix_table_take_entry(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.GT.take(100)
Esempio n. 45
0
def matrix_table_array_arithmetic():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt = mt.filter_rows(mt.alleles.length() == 2)
    mt.select_entries(dosage = hl.pl_dosage(mt.PL)).select_rows()._force_count_rows()
Esempio n. 46
0
def get_gnomad_data(data_type: str,
                    adj: bool = False,
                    split: bool = True,
                    raw: bool = False,
                    non_refs_only: bool = False,
                    hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: str = None,
                    meta_root: Optional[str] = 'meta',
                    full_meta: bool = False,
                    fam_version: str = CURRENT_FAM,
                    fam_root: str = None,
                    duplicate_mapping_root: str = None,
                    release_samples: bool = False,
                    release_annotations: Optional[str] = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as a MatrixTable. By default, returns split hardcalls (with adj annotated but not filtered).

    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies to raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotype only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param bool full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of the fam file (defaults to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    #from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException(
            'No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type,
                                 split=split,
                                 non_refs_only=non_refs_only,
                                 hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type,
                                 hardcalls=not raw,
                                 split=split,
                                 hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(
            genomes_exomes_duplicate_ids_tsv_path,
            impute=True,
            key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_mt = get_gnomad_public_data(data_type, split,
                                          release_annotations)
        mt = mt.select_rows(release=sites_mt[
            mt.v, :])  # TODO: replace with ** to nuke old annotations

    return mt
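
# Illustrative usage of get_gnomad_data (a sketch; the argument values below are
# assumptions, not taken from the original module):
exomes_mt = get_gnomad_data('exomes', release_samples=True)         # split hardcalls, release samples only
genomes_raw_mt = get_gnomad_data('genomes', split=False, raw=True)  # raw, unsplit (very large)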
Esempio n. 47
0
def matrix_table_rows_force_count():
    ht = hl.read_matrix_table(resource('profile.mt')).rows().key_by()
    ht._force_count()
Esempio n. 48
0
variant_qc_table_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/5b_variant_qc_table.txt.gz'



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define constants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

mincallrate = 0.98
hwep = 0.000001

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.read_matrix_table(qced_vds_file)

samples_to_keep = hl.import_table(samples_to_keep_file, no_header=True).key_by('f0')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# fix HWE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

step0 = vds.variant_qc.AC > 0
step1 = vds.filters.size() == 0
step2 = vds.info.QD > 4
step3 = vds.lowestcallrate >= mincallrate
step4 = vds.lowestphwe >= hwep


qc_struct = hl.Struct(step0 = step0, step1 = step1, step2 = step2, step3=step3, step4=step4)
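
# How the step flags above would typically be attached and applied; an illustrative sketch,
# not part of the original snippet (qc_pass, vds_annotated and vds_pass are hypothetical names):
vds_annotated = vds.annotate_rows(qc_pass=hl.struct(step0=step0, step1=step1, step2=step2,
                                                    step3=step3, step4=step4))
vds_pass = vds_annotated.filter_rows(vds_annotated.qc_pass.step0 & vds_annotated.qc_pass.step1 &
                                     vds_annotated.qc_pass.step2 & vds_annotated.qc_pass.step3 &
                                     vds_annotated.qc_pass.step4)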
Esempio n. 49
0
def matrix_table_decode_and_count():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt._force_count_rows()
Esempio n. 50
0
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
# MT_1KG = 'gs://raw_data_bipolar/data/ALL.1KG.qc.hardcalls.mt' # Old b37 version
MT_1KG = 'gs://hail-datasets-hail-data/1000_Genomes_autosomes.phase_3.GRCh38.mt'
# Check the size of this dataset.
# Make sure that the same samples are used in the GRCh38 version; there are fewer samples here.

PCA_SCORES_1KG = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/10_pca_scores_1kg.tsv'

PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'
# POPULATIONS_1KG = 'gs://raw_data_bipolar/inputs/samples_1kg.tsv'
POPULATIONS_1KG = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/samples_1kg.ht'
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'
IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

mt_1kg = hl.read_matrix_table(MT_1KG)
mt_1kg = hl.split_multi_hts(mt_1kg)
# This is to enable a join later.
mt_1kg = mt_1kg.select_entries("GT")
# Write this to annotate with later
mt_1kg.cols().write(output=POPULATIONS_1KG, overwrite=True)
# This is also to enable a join later.
mt_1kg = mt_1kg.select_cols()
populations_1kg = hl.read_table(POPULATIONS_1KG)

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
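# The imported keep-lists are typically applied as semi-joins before downstream QC and PCA;
# a minimal sketch of that step (assumed usage, not shown in this snippet):
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.s]))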
Esempio n. 51
0
import sys
import hail as hl
sys.path.insert(0, '/home/danfengc/unicorn')

from qc.utils import *
from qc.plotting import *
from main.pca import *

######################################################################################################
# This part of the code is used to convert the AJ MatrixTable to PLINK format                        #
######################################################################################################
# convert it to plink format
mt_AJ = hl.read_matrix_table(
    'gs://unicorn-resources/Ashkenazi_Jewish_Samples/Ashkenazi_Jewish_Samples.mt'
)
hl.export_plink(
    mt_AJ,
    output=
    'gs://unicorn-resources/Ashkenazi_Jewish_Samples/Ashkenazi_Jewish_Samples',
    varid=mt_AJ.rsid,
    cm_position=mt_AJ.cm_position)

######################################################################################################
# This part of the code is to convert 1KG PLINK BFILE to hail matrix table                           #
# The source of the 1KG PLINK bfiles:                                                                #
#    `/psych/genetics_data/ripke/references_outdated/hapmap_ref/impute2_ref/1KG_Aug12/               #
#      ALL_1000G_phase1integrated_v3_impute_macGT1/4pops/qc/pop_4pop_mix_SEQ`                        #
# It is a cleaned PLINK BFILE generated by Stephan.                                                  #
######################################################################################################
mt_1kg = hl.import_plink(
    bed=
Esempio n. 52
0
def per_row_stats_star_star(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.annotate_rows(**hl.agg.stats(mt.x))._force_count_rows()