def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    self._config = {
        'name': name,
        'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': [{'start': {'contig': c, 'position': s},
                 'end': {'contig': c, 'position': e}}
                for (c, s, e) in par]
    }

    self._contigs = contigs
    self._lengths = lengths
    self._par_tuple = par
    self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
                 for (c, s, e) in par]

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)
        hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()

def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    par_jrep = [interval._jrep for interval in par]

    jrep = (Env.hail().variant.GenomeReference
            .apply(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par_jrep))

    self._init_from_java(jrep)
    self._name = name
    self._contigs = contigs
    self._lengths = lengths
    self._x_contigs = x_contigs
    self._y_contigs = y_contigs
    self._mt_contigs = mt_contigs
    self._par = par

    super(GenomeReference, self).__init__()

def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    self._config = {
        'name': name,
        'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': [{'start': {'contig': c, 'position': s},
                 'end': {'contig': c, 'position': e}}
                for (c, s, e) in par]
    }

    self._contigs = contigs
    self._lengths = lengths
    self._par_tuple = par
    self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
                 for (c, s, e) in par]
    self._global_positions = None

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)
        hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()

def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    par_strings = ["{}:{}-{}".format(contig, start, end)
                   for (contig, start, end) in par]

    jrep = (Env.hail().variant.GenomeReference
            .apply(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par_strings))

    self._init_from_java(jrep)
    self._name = name
    self._contigs = contigs
    self._lengths = lengths
    self._x_contigs = x_contigs
    self._y_contigs = y_contigs
    self._mt_contigs = mt_contigs
    self._par = None
    self._par_tuple = par

    super(GenomeReference, self).__init__()

def linreg(dataset, ys, x, covariates=[], root='linreg', block_size=16):
    r"""Test each variant for association with multiple phenotypes using linear regression.

    .. warning::

        :py:meth:`.linreg` uses the same set of samples for each phenotype,
        namely the set of samples for which **all** phenotypes and covariates are defined.

    **Annotations**

    With the default root, the following variant annotations are added.
    The indexing of the array annotations corresponds to that of ``ys``.

    - **va.linreg.nCompleteSamples** (*Int*) -- number of samples used
    - **va.linreg.AC** (*Double*) -- sum of input values ``x``
    - **va.linreg.ytx** (*Array[Double]*) -- array of dot products of each response vector ``y`` with the input vector ``x``
    - **va.linreg.beta** (*Array[Double]*) -- array of fit effect coefficients, :math:`\hat\beta_1`
    - **va.linreg.se** (*Array[Double]*) -- array of estimated standard errors, :math:`\widehat{\mathrm{se}}`
    - **va.linreg.tstat** (*Array[Double]*) -- array of :math:`t`-statistics, equal to :math:`\hat\beta_1 / \widehat{\mathrm{se}}`
    - **va.linreg.pval** (*Array[Double]*) -- array of :math:`p`-values

    :param ys: list of one or more response expressions.
    :type ys: list of str

    :param str x: expression for input variable

    :param covariates: list of covariate expressions.
    :type covariates: list of str

    :param str root: Variant annotation path to store result of linear regression.

    :param int block_size: Number of variant regressions to perform simultaneously.
        Larger block size requires more memory.

    :return: Variant dataset with linear regression variant annotations.
    :rtype: :py:class:`.VariantDataset`
    """
    all_exprs = [x]
    ys = wrap_to_list(ys)

    # x is entry-indexed
    analyze(x, dataset._entry_indices, set(), set(dataset._fields.keys()))

    # ys and covariates are col-indexed
    for e in (tuple(wrap_to_list(ys)) + tuple(covariates)):
        all_exprs.append(e)
        analyze(e, dataset._col_indices, set(), set(dataset._fields.keys()))

    base, cleanup = dataset._process_joins(*all_exprs)

    jm = base._jvds.linreg(
        jarray(Env.jvm().java.lang.String, [y._ast.to_hql() for y in ys]),
        x._ast.to_hql(),
        jarray(Env.jvm().java.lang.String, [cov._ast.to_hql() for cov in covariates]),
        'va.`{}`'.format(root),
        block_size
    )
    return cleanup(MatrixTable(dataset._hc, jm))

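# A minimal usage sketch for the transitional API above, in which `ys`, `x`,
# and `covariates` are expression objects despite the docstring's older `str`
# types (the body calls `analyze` and `._ast` on them). All field names here
# are hypothetical, invented purely for illustration:
result = linreg(dataset,
                ys=[dataset.height, dataset.weight],
                x=dataset.dosage,          # entry-indexed input variable
                covariates=[dataset.pc1],  # column-indexed covariates
                root='linreg',
                block_size=16)
# Because every response in `ys` shares one sample mask, va.linreg.beta[0]
# and va.linreg.beta[1] are directly comparable across phenotypes.
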
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    par_strings = ["{}:{}-{}".format(contig, start, end)
                   for (contig, start, end) in par]

    jrep = (Env.hail().variant.ReferenceGenome
            .apply(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par_strings))

    self._init_from_java(jrep)
    self._name = name
    self._contigs = contigs
    self._lengths = lengths
    self._x_contigs = x_contigs
    self._y_contigs = y_contigs
    self._mt_contigs = mt_contigs
    self._par = None
    self._par_tuple = par

    super(ReferenceGenome, self).__init__()
    ReferenceGenome._references[name] = self

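# A hedged construction sketch against the tuple-style constructors above
# (the ones whose bodies iterate `for (c, s, e) in par`). The genome, its
# contigs, and its lengths are invented for illustration:
toy_rg = ReferenceGenome(name='toy_genome',
                         contigs=['chr1', 'chrX', 'chrY', 'chrM'],
                         lengths={'chr1': 100000, 'chrX': 90000,
                                  'chrY': 60000, 'chrM': 16569},
                         x_contigs='chrX',  # a bare str works: wrap_to_list wraps it
                         y_contigs='chrY',
                         mt_contigs='chrM',
                         par=[('chrX', 1, 1000), ('chrY', 1, 1000)])
# Each (contig, start, end) tuple in `par` becomes one pseudoautosomal
# interval; the wrap_to_list calls are what let x/y/mt_contigs accept
# either a single contig name or a list of names.
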
def select(self, column_names):
    """Select a subset of columns.

    **Examples**

    Assume ``kt1`` is a :py:class:`.KeyTable` with three columns: C1, C2
    and C3.

    Select/drop columns:

    >>> kt_result = kt1.select('C1')

    Reorder the columns:

    >>> kt_result = kt1.select(['C3', 'C1', 'C2'])

    Drop all columns:

    >>> kt_result = kt1.select([])

    :param column_names: List of columns to be selected.
    :type column_names: str or list of str

    :return: Key table with selected columns.
    :rtype: :class:`.KeyTable`
    """
    column_names = wrap_to_list(column_names)
    new_key = [k for k in self.key if k in column_names]
    return KeyTable(self.hc, self._jkt.select(column_names, new_key))

def key_by(self, key):
    """Change which columns are keys.

    **Examples**

    Assume ``kt1`` is a :py:class:`.KeyTable` with three columns: c1, c2
    and c3 and key c1.

    Change key columns:

    >>> kt_result = kt1.key_by(['C2', 'C3'])

    >>> kt_result = kt1.key_by('C2')

    Set to no keys:

    >>> kt_result = kt1.key_by([])

    :param key: List of columns to be used as keys.
    :type key: str or list of str

    :return: Key table whose key columns are given by ``key``.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(self.hc, self._jkt.keyBy(wrap_to_list(key)))

def key_by(self, key):
    """Change which columns are keys.

    **Examples**

    Assume ``kt1`` is a :py:class:`.KeyTable` with three columns: c1, c2
    and c3 and key c1.

    Change key columns:

    >>> kt_result = kt1.key_by(['C2', 'C3'])

    >>> kt_result = kt1.key_by('C2')

    Set to no keys:

    >>> kt_result = kt1.key_by([])

    :param key: List of columns to be used as keys.
    :type key: str or list of str

    :return: Key table whose key columns are given by ``key``.
    :rtype: :class:`.KeyTable`
    """
    if isinstance(key, list):
        for k in key:
            if not isinstance(k, str) and not isinstance(k, unicode):
                raise TypeError("expected str or unicode elements of 'key' list, but found %s" % type(k))
    elif not isinstance(key, str) and not isinstance(key, unicode):
        raise TypeError("expected str or list of str for parameter 'key', but found %s" % type(key))

    return KeyTable(self.hc, self._jkt.keyBy(wrap_to_list(key)))

def from_dataframe(df, key=[]):
    """Convert Spark SQL DataFrame to key table.

    Spark SQL data types are converted to Hail types as follows:

    .. code-block:: text

        BooleanType => Boolean
        IntegerType => Int
        LongType => Long
        FloatType => Float
        DoubleType => Double
        StringType => String
        BinaryType => Binary
        ArrayType => Array
        StructType => Struct

    Unlisted Spark SQL data types are currently unsupported.

    :param df: PySpark DataFrame.
    :type df: ``DataFrame``

    :param key: Key column(s).
    :type key: str or list of str

    :return: Key table constructed from the Spark SQL DataFrame.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(
        Env.hc(),
        Env.hail().keytable.KeyTable.fromDF(Env.hc()._jhc, df._jdf,
                                            wrap_to_list(key)))

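# A short round-trip sketch for from_dataframe, assuming a PySpark
# `sqlContext` is in scope (standard for this API era); the column names
# and values are invented:
df = sqlContext.createDataFrame([('PT-0001', 24), ('PT-0002', 19)],
                                ['sample', 'age'])
kt = KeyTable.from_dataframe(df, key='sample')
# `age` arrives as Spark LongType, so it maps to Long per the table in
# the docstring above; `key='sample'` relies on wrap_to_list accepting a
# single column name.
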
def explode(self, column_names):
    """Explode columns of this key table.

    The explode operation unpacks the elements in a column of type
    ``Array`` or ``Set`` into its own row. If an empty ``Array`` or ``Set``
    is exploded, the entire row is removed from the :py:class:`.KeyTable`.

    **Examples**

    Assume ``kt3`` is a :py:class:`.KeyTable` with three columns: c1, c2
    and c3.

    >>> kt3 = hc.import_table('data/kt_example3.tsv', impute=True,
    ...                       types={'c1': TString(), 'c2': TArray(TInt32()),
    ...                              'c3': TArray(TArray(TInt32()))})

    The types of each column are ``String``, ``Array[Int]``, and
    ``Array[Array[Int]]`` respectively. c1 cannot be exploded because its
    type is not an ``Array`` or ``Set``. c2 can only be exploded once
    because the type of c2 after the first explode operation is ``Int``.

    +----+----------+----------------+
    | c1 |    c2    |       c3       |
    +====+==========+================+
    |  a | [1,2,NA] |  [[3,4], []]   |
    +----+----------+----------------+

    Explode c2:

    >>> kt3.explode('c2')

    +----+-------+-----------------+
    | c1 |  c2   |       c3        |
    +====+=======+=================+
    |  a |   1   |   [[3,4], []]   |
    +----+-------+-----------------+
    |  a |   2   |   [[3,4], []]   |
    +----+-------+-----------------+

    Explode c2 once and c3 twice:

    >>> kt3.explode(['c2', 'c3', 'c3'])

    +----+-------+-------------+
    | c1 |  c2   |     c3      |
    +====+=======+=============+
    |  a |   1   |      3      |
    +----+-------+-------------+
    |  a |   2   |      3      |
    +----+-------+-------------+
    |  a |   1   |      4      |
    +----+-------+-------------+
    |  a |   2   |      4      |
    +----+-------+-------------+

    :param column_names: Column name(s) to be exploded.
    :type column_names: str or list of str

    :return: Key table with columns exploded.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(self.hc, self._jkt.explode(wrap_to_list(column_names)))

def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in
    R's `tidyr` package. This is a way to turn "long" format data into
    "wide" format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to
    the ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row
    field for each unique value of ``field``, where the row field values are
    given by the corresponding ``value`` in the original ``ht``.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    field : :obj:`str`
        The name of the factor field in `ht`.
    value : :obj:`str`
        The name of the value field in `ht`.
    key : :obj:`str` or :obj:`list` of :obj:`str`, optional
        The name of any fields to group by, in addition to the
        row key fields of ``ht``.

    Returns
    -------
    :class:`.Table`
        Table with the original ``field`` and ``value`` fields spread across
        multiple columns.
    """
    if key is None:
        key = list(ht.key)
    else:
        key = wrap_to_list(key)
        key = list(ht.key) + key

    field_vals = list(ht.aggregate(hl.agg.collect_as_set(ht[field])))
    ht = (ht.group_by(*key)
            .aggregate(
                **{rv: hl.agg.take(ht[rv], 1)[0]
                   for rv in ht.row_value
                   if rv not in set(key + [field, value])},
                **{fv: hl.agg.filter(
                       ht[field] == fv,
                       hl.rbind(
                           hl.agg.take(ht[value], 1),
                           lambda take: hl.cond(hl.len(take) > 0,
                                                take[0],
                                                'NA')))
                   for fv in field_vals}))

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)
    # read the checkpoint back so the caller uses the written copy rather
    # than recomputing the aggregation
    return hl.read_table(ht_tmp)

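# A worked long-to-wide sketch for spread() above, on an invented table:
# rows of (sample, metric, value) become one row per sample with one
# column per metric. Values are strings because missing cells are filled
# with the literal 'NA' by the hl.cond above:
ht_long = hl.Table.parallelize(
    [{'sample': 's1', 'metric': 'height', 'value': '154.1'},
     {'sample': 's1', 'metric': 'age', 'value': '24'},
     {'sample': 's2', 'metric': 'height', 'value': '160.9'}],
    hl.tstruct(sample=hl.tstr, metric=hl.tstr, value=hl.tstr)).key_by('sample')
ht_wide = spread(ht_long, field='metric', value='value')
# ht_wide has one row per sample with fields `height` and `age`; s2 gets
# age == 'NA' since the long table supplied no such row.
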
def parallelize(cls, rows, schema, key=[], num_partitions=None):
    """Construct a key table from a list of rows.

    **Examples**

    >>> rows = [{'a': 5, 'b': 'foo', 'c': False},
    ...         {'a': None, 'b': 'bar', 'c': True},
    ...         {'b': 'baz', 'c': False}]
    >>> schema = TStruct(['a', 'b', 'c'], [TInt32(), TString(), TBoolean()])
    >>> table = KeyTable.parallelize(rows, schema, key='b')

    This table will look like:

    .. code-block:: text

        >>> table.to_dataframe().show()
        +----+---+-----+
        |   a|  b|    c|
        +----+---+-----+
        |   5|foo|false|
        |null|bar| true|
        |null|baz|false|
        +----+---+-----+

    :param rows: List of rows to include in table.
    :type rows: list of :class:`.hail.representation.Struct` or dict

    :param schema: Struct schema of table.
    :type schema: :class:`.hail.expr.TStruct`

    :param key: Key field(s).
    :type key: str or list of str

    :param num_partitions: Number of partitions to generate.
    :type num_partitions: int or None

    :return: Key table parallelized from the given rows.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(
        Env.hc(),
        Env.hail().keytable.KeyTable.parallelize(
            Env.hc()._jhc, [schema._convert_to_j(r) for r in rows],
            schema._jtype, wrap_to_list(key), joption(num_partitions)))

def drop(self, column_names):
    """Drop columns.

    **Examples**

    Assume ``kt1`` is a :py:class:`.KeyTable` with three columns: C1, C2
    and C3.

    Drop columns:

    >>> kt_result = kt1.drop('C1')

    >>> kt_result = kt1.drop(['C1', 'C2'])

    :param column_names: List of columns to be dropped.
    :type column_names: str or list of str

    :return: Key table with dropped columns.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(self.hc, self._jkt.drop(wrap_to_list(column_names)))

def import_table(self, paths, key=[], min_partitions=None, impute=False,
                 no_header=False, comment=None, delimiter="\t", missing="NA",
                 types={}, quote=None, reference_genome=None):
    """Import delimited text file (text table) as key table.

    The resulting key table will have no key columns, use
    :py:meth:`.KeyTable.key_by` to specify keys.

    **Example**

    Given this file

    .. code-block:: text

        $ cat data/samples1.tsv
        Sample   Height  Status   Age
        PT-1234  154.1   ADHD     24
        PT-1236  160.9   Control  19
        PT-1238  NA      ADHD     89
        PT-1239  170.3   Control  55

    The interesting thing about this table is that column ``Height`` is a
    floating-point number, and column ``Age`` is an integer. We can either
    have Hail impute these types from the file, or pass them ourselves.

    Pass the types ourselves:

    >>> table = hc1.import_table('data/samples1.tsv', types={'Height': TFloat64(), 'Age': TInt32()})

    Note that string columns like ``Sample`` and ``Status`` do not need to
    be typed, because ``String`` is the default type.

    Use type imputation (a bit easier, but requires reading the file twice):

    >>> table = hc1.import_table('data/samples1.tsv', impute=True)

    **Detailed examples**

    Let's import annotations from a CSV file with missing data and special
    characters:

    .. code-block:: text

        $ cat data/samples2.tsv
        Batch,PT-ID
        1kg,PT-0001
        1kg,PT-0002
        study1,PT-0003
        study3,PT-0003
        .,PT-0004
        1kg,PT-0005
        .,PT-0006
        1kg,PT-0007

    In this case, we should:

    - Pass the non-default delimiter ``,``
    - Pass the non-default missing value ``.``

    >>> table = hc1.import_table('data/samples2.tsv', delimiter=',', missing='.')

    Let's import annotations from a file with no header and sample IDs that
    need to be transformed. Suppose the vds sample IDs are of the form
    ``NA#####``. This file has no header line, and the sample ID is hidden
    in a field with other information.

    .. code-block:: text

        $ cat data/samples3.tsv
        1kg_NA12345   female
        1kg_NA12346   male
        1kg_NA12348   female
        pgc_NA23415   male
        pgc_NA23418   male

    To import:

    >>> annotations = (hc1.import_table('data/samples3.tsv', no_header=True)
    ...                   .annotate('sample = f0.split("_")[1]')
    ...                   .key_by('sample'))

    **Notes**

    The ``impute`` option tells Hail to scan the file an extra time to
    gather information about possible field types. While this is a bit
    slower for large files (the file is parsed twice), the convenience is
    often worth the cost.

    The ``delimiter`` parameter is either a delimiter character (if a
    single character) or a field separator regex (2 or more characters).
    This regex follows the `Java regex standard
    <http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html>`_.

    .. note::

        Use ``delimiter='\\s+'`` to specify whitespace delimited files.

    The ``comment`` is an optional parameter which causes Hail to skip any
    line that starts in the given pattern. Passing ``comment='#'`` will
    skip any line beginning in a pound sign, for example.

    The ``missing`` parameter defines the representation of missing data in
    the table.

    .. note::

        The ``comment`` and ``missing`` parameters are **NOT** regexes.

    The ``no_header`` option indicates that the file has no header line. If
    this option is passed, then the column names will be ``f0``, ``f1``,
    ... ``fN`` (0-indexed).

    The ``types`` option allows the user to pass the types of columns in
    the table. This is a dict keyed by ``str``, with
    :py:class:`~hail.expr.Type` values. See the examples above for a
    standard usage. Additionally, this option can be used to override type
    imputation. For example, if a column in a file refers to chromosome and
    does not contain any sex chromosomes, it will be imputed as an integer,
    while most Hail methods expect chromosome to be passed as a string.
    Using the ``impute=True`` mode and passing
    ``types={'Chromosome': TString()}`` will solve this problem.

    The ``min_partitions`` option can be used to increase the number of
    partitions (level of sharding) of an imported table. The default
    partition size depends on file system and a number of other factors
    (including the ``min_block_size`` of the hail context), but usually is
    between 32M and 128M.

    :param paths: Files to import.
    :type paths: str or list of str

    :param key: Key column(s).
    :type key: str or list of str

    :param min_partitions: Minimum number of partitions.
    :type min_partitions: int or None

    :param bool no_header: File has no header and the N columns are named
        ``f0``, ``f1``, ... ``fN`` (0-indexed)

    :param bool impute: Impute column types from the file

    :param comment: Skip lines beginning with the given pattern
    :type comment: str or None

    :param str delimiter: Field delimiter regex

    :param str missing: Specify identifier to be treated as missing

    :param types: Define types of fields in annotations files
    :type types: dict with str keys and :py:class:`.Type` values

    :param quote: Quote character
    :type quote: str or None

    :param reference_genome: Reference genome to use when imputing Variant
        or Locus columns. Default is :class:`~.HailContext.default_reference`.
    :type reference_genome: :class:`.GenomeReference`

    :return: Key table constructed from text table.
    :rtype: :class:`.KeyTable`
    """
    key = wrap_to_list(key)
    paths = wrap_to_list(paths)
    jtypes = {k: v._jtype for k, v in types.items()}
    rg = reference_genome if reference_genome else self.default_reference

    jkt = self._jhc.importTable(paths, key, min_partitions, jtypes, comment,
                                delimiter, missing, no_header, impute, quote,
                                rg._jrep)
    return KeyTable(self, jkt)

def ld_score(entry_expr, annotation_exprs, position_expr, window_size) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(univariate_annotation=1,
    ...                       binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Annotate MatrixTable with alt allele count stats
    >>> mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))

    >>> # Create standardized genotype entry
    >>> mt = mt.annotate_entries(GT_std=hl.or_else(
    ...     (mt.GT.n_alt_alleles() - mt.stats.mean)/mt.stats.stdev, 0.0))

    >>> # Calculate LD scores using standardized genotypes
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT_std,
    ...                                      annotation_exprs=[
    ...                                          mt.univariate_annotation,
    ...                                          mt.binary_annotation,
    ...                                          mt.continuous_annotation],
    ...                                      position_expr=mt.cm_position,
    ...                                      window_size=1)

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a
    missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`
        Annotation expression(s) to partition LD scores.
    position_expr : :class:`.NumericExpression`
        Expression for position of variant
        (e.g. ``mt.cm_position`` or ``mt.locus.position``).
    window_size : :obj:`int` or :obj:`float`
        Size of variant window used to calculate LD scores, in units of
        ``position``.

    Returns
    -------
    :class:`.Table`
        Locus-keyed table with LD scores for each variant and annotation.
    """
    assert window_size >= 0

    mt = entry_expr._indices.source
    annotations = wrap_to_list(annotation_exprs)
    variant_key = [x for x in mt.row_key]

    ht_annotations = mt.select_rows(*annotations).rows()
    annotation_names = [x for x in ht_annotations.row
                        if x not in variant_key]

    ht_annotations = hl.Table.union(
        *[(ht_annotations.annotate(annotation=hl.str(x),
                                   value=hl.float(ht_annotations[x]))
                         .select('annotation', 'value'))
          for x in annotation_names])
    mt_annotations = ht_annotations.to_matrix_table(row_key=variant_key,
                                                    col_key=['annotation'])

    cols = mt_annotations['annotation'].collect()
    col_idxs = {i: cols[i] for i in range(len(cols))}

    G = BlockMatrix.from_entry_expr(entry_expr)
    A = BlockMatrix.from_entry_expr(mt_annotations.value)

    n = G.n_cols

    R2 = ((G @ G.T) / n) ** 2
    R2_adj = R2 - (1.0 - R2) / (n - 2.0)

    positions = [(x[0], float(x[1])) for x in
                 hl.array([mt.locus.contig, hl.str(position_expr)]).collect()]
    n_positions = len(positions)

    starts = np.zeros(n_positions, dtype='int')
    stops = np.zeros(n_positions, dtype='int')

    contig = '0'
    for i, (c, p) in enumerate(positions):
        if c != contig:
            j = i
            k = i
            contig = c

        min_val = p - window_size
        max_val = p + window_size

        while j < n_positions and positions[j][1] < min_val:
            j += 1
        starts[i] = j

        if k == n_positions:
            stops[i] = k
            continue

        while (positions[k][0] == contig
               and positions[k][1] <= max_val):
            k += 1
            if k == n_positions:
                break
        stops[i] = k

    R2_adj_sparse = R2_adj.sparsify_row_intervals(
        [int(x) for x in starts], [int(x) for x in stops])

    L2 = R2_adj_sparse @ A

    tmp_bm_path = new_temp_file()
    tmp_tsv_path = new_temp_file()

    L2.write(tmp_bm_path, force_row_major=True)
    BlockMatrix.export(tmp_bm_path, tmp_tsv_path)

    ht_scores = hl.import_table(tmp_tsv_path, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename(
        {'f{:}'.format(i): col_idxs[i] for i in range(len(cols))})

    ht_variants = mt.rows()
    ht_variants = ht_variants.drop(
        *[x for x in ht_variants.row if x not in variant_key])
    ht_variants = ht_variants.add_index()
    ht_variants = ht_variants.key_by('idx')

    ht_scores = ht_variants.join(ht_scores, how='inner')
    ht_scores = ht_scores.key_by('locus')
    ht_scores = ht_scores.drop('alleles', 'idx')

    return ht_scores

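# The start/stop computation above is easier to see on a toy example. This
# standalone sketch is a simplified re-derivation of the same per-contig
# sliding-window idea (the original reuses its `j`/`k` pointers across
# iterations for efficiency; this version recomputes `k` from `i`). The
# positions are invented:
positions = [('20', 1.0), ('20', 1.5), ('20', 3.0), ('21', 1.0)]
window = 1.0
starts, stops = [], []
j = 0
for i, (c, p) in enumerate(positions):
    if i == 0 or positions[i - 1][0] != c:
        j = i  # new contig: windows never span contigs
    while positions[j][1] < p - window:
        j += 1
    k = i
    while (k < len(positions) and positions[k][0] == c
           and positions[k][1] <= p + window):
        k += 1
    starts.append(j)
    stops.append(k)
print(starts, stops)  # [0, 0, 2, 3] [2, 2, 3, 4]
# Row i of the correlation matrix is then sparsified to the half-open
# column interval [starts[i], stops[i]), exactly what
# sparsify_row_intervals consumes above.
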
def linreg(y, x, nested_dim=1, weight=None) -> StructExpression:
    """Compute multivariate linear regression statistics.

    Examples
    --------
    Regress HT against an intercept (1), SEX, and C1:

    >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1]))
    Struct(beta=[88.50000000000014, 81.50000000000057, -10.000000000000068],
           standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016],
           t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435],
           p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281],
           multiple_standard_error=4.949747468305833,
           multiple_r_squared=0.7175792507204611,
           adjusted_r_squared=0.1527377521613834,
           f_stat=1.2704081632653061,
           multiple_p_value=0.5314327326007864,
           n=4)

    Regress blood pressure against an intercept (1), genotype, age, and
    the interaction of genotype and age:

    >>> ds_ann = ds.annotate_rows(linreg =
    ...     hl.agg.linreg(ds.pheno.blood_pressure,
    ...                   [1,
    ...                    ds.GT.n_alt_alleles(),
    ...                    ds.pheno.age,
    ...                    ds.GT.n_alt_alleles() * ds.pheno.age]))

    Warning
    -------
    As in the example, the intercept covariate ``1`` must be included
    **explicitly** if desired.

    Notes
    -----
    In relation to
    `lm.summary <https://stat.ethz.ch/R-manual/R-devel/library/stats/html/summary.lm.html>`__
    in R, ``linreg(y, x = [1, mt.x1, mt.x2])`` computes
    ``summary(lm(y ~ x1 + x2))`` and
    ``linreg(y, x = [mt.x1, mt.x2], nested_dim=0)`` computes
    ``summary(lm(y ~ x1 + x2 - 1))``.

    More generally, `nested_dim` defines the number of effects to fit in
    the nested (null) model, with the effects on the remaining covariates
    fixed to zero.

    The returned struct has ten fields:

    - `beta` (:class:`.tarray` of :py:data:`.tfloat64`):
      Estimated regression coefficient for each covariate.
    - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`):
      Estimated standard error for each covariate.
    - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`):
      t-statistic for each covariate.
    - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`):
      p-value for each covariate.
    - `multiple_standard_error` (:py:data:`.tfloat64`):
      Estimated standard deviation of the random error.
    - `multiple_r_squared` (:py:data:`.tfloat64`):
      Coefficient of determination for nested models.
    - `adjusted_r_squared` (:py:data:`.tfloat64`):
      Adjusted `multiple_r_squared` taking into account degrees of freedom.
    - `f_stat` (:py:data:`.tfloat64`):
      F-statistic for nested models.
    - `multiple_p_value` (:py:data:`.tfloat64`):
      p-value for the
      `F-test <https://en.wikipedia.org/wiki/F-test#Regression_problems>`__
      of nested models.
    - `n` (:py:data:`.tint64`):
      Number of samples included in the regression. A sample is included
      if and only if `y`, all elements of `x`, and `weight` (if set) are
      non-missing.

    All but the last field are missing if `n` is less than or equal to the
    number of covariates or if the covariates are linearly dependent.

    If set, the `weight` parameter generalizes the model to
    `weighted least squares <https://en.wikipedia.org/wiki/Weighted_least_squares>`__,
    useful for heteroscedastic (diagonal but non-constant) variance.

    Warning
    -------
    If any weight is negative, the resulting statistics will be ``nan``.

    Parameters
    ----------
    y : :class:`.Float64Expression`
        Response (dependent variable).
    x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        Covariates (independent variables).
    nested_dim : :obj:`int`
        The null model includes the first `nested_dim` covariates.
        Must be between 0 and `k` (the length of `x`).
    weight : :class:`.Float64Expression`, optional
        Non-negative weight for weighted least squares.

    Returns
    -------
    :class:`.StructExpression`
        Struct of regression results.
    """
    x = wrap_to_list(x)
    if len(x) == 0:
        raise ValueError("linreg: must have at least one covariate in `x`")

    hl.methods.statgen._warn_if_no_intercept('linreg', x)

    if weight is None:
        return _linreg(y, x, nested_dim)
    else:
        return _linreg(hl.sqrt(weight) * y,
                       [hl.sqrt(weight) * xi for xi in x],
                       nested_dim)

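# Why the weighted branch multiplies by sqrt(weight): weighted least
# squares on (y, X, w) has the same solution as ordinary least squares on
# (sqrt(w)*y, sqrt(w)*X). A standalone numpy check on invented data:
import numpy as np

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(50), rng.normal(size=50)])  # intercept + one covariate
y = X @ np.array([2.0, -1.0]) + rng.normal(size=50)
w = rng.uniform(0.5, 2.0, size=50)

# direct WLS: solve (X^T W X) beta = X^T W y
beta_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))

# OLS after scaling every row (response and covariates alike) by sqrt(w)
s = np.sqrt(w)
beta_ols = np.linalg.lstsq(s[:, None] * X, s * y, rcond=None)[0]

assert np.allclose(beta_wls, beta_ols)
# Note the intercept is scaled too, which is why the code applies
# sqrt(weight) to every element of `x`, including a literal 1.
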
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a
    missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr : :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or
                       :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.
    """
    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()

    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value'))
              for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row
                     if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht

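# The ((n-1)/(n-2)) * r2 - 1/(n-2) step above debiases squared sample
# correlations: for two truly uncorrelated variables, E[r^2] is about
# 1/(n-1), not 0, so raw LD scores would inflate with panel size. A
# standalone numpy check on invented data:
import numpy as np

rng = np.random.default_rng(1)
n = 25
r2 = np.array([np.corrcoef(rng.normal(size=n), rng.normal(size=n))[0, 1] ** 2
               for _ in range(5000)])
r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

print(r2.mean())      # close to 1/(n-1) ~= 0.0417
print(r2_adj.mean())  # close to 0, as the adjustment intends
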
def parallelize(cls, rows, schema, key=[], num_partitions=None):
    # Undocumented variant of parallelize that returns a Table; see the
    # documented KeyTable.parallelize above for parameter semantics.
    return Table(
        Env.hc(),
        Env.hail().keytable.KeyTable.parallelize(
            Env.hc()._jhc, [schema._convert_to_j(r) for r in rows],
            schema._jtype, wrap_to_list(key), joption(num_partitions)))

def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknives mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
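# The delete-one-block jackknife used throughout the function above can be
# checked against a plain NumPy sketch. Illustration only, not part of Hail;
# `block_jackknife` and `weighted_slope` are hypothetical helper names. Each
# pseudo-value is n_blocks * theta_hat - (n_blocks - 1) * theta_without_block,
# and the mean/variance formulas mirror the Hail expressions above.
import numpy as np

def weighted_slope(x, y, w):
    # weighted least squares of y on [1, x]; returns the slope
    X = np.column_stack([np.ones_like(x), x])
    beta = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))
    return beta[1]

def block_jackknife(x, y, w, n_blocks, estimator=weighted_slope):
    n = len(y)
    theta_hat = estimator(x, y, w)
    # evenly spaced block separators, as in the __step1_block assignment above
    seps = np.floor(np.arange(n_blocks + 1) * (n / n_blocks)).astype(int)
    pseudovalues = np.array([
        n_blocks * theta_hat
        - (n_blocks - 1) * estimator(np.delete(x, slice(seps[b], seps[b + 1])),
                                     np.delete(y, slice(seps[b], seps[b + 1])),
                                     np.delete(w, slice(seps[b], seps[b + 1])))
        for b in range(n_blocks)])
    variance = ((pseudovalues ** 2).sum()
                - pseudovalues.sum() ** 2 / n_blocks) / (n_blocks - 1) / n_blocks
    return pseudovalues.mean(), variance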
def import_gtf(path, key=None): """Import a GTF file. The GTF file format is identical to the GFF version 2 file format, and so this function can be used to import GFF version 2 files as well. See https://www.ensembl.org/info/website/upload/gff.html for more details on the GTF/GFF2 file format. The :class:`.Table` returned by this function will include the following row fields: .. code-block:: text 'seqname': str 'source': str 'feature': str 'start': int32 'end': int32 'score': float64 'strand': str 'frame': int32 There will also be corresponding fields for every tag found in the attribute field of the GTF file. .. note:: The "end" field in the table will be incremented by 1 in comparison to the value found in the GTF file, as the end coordinate in a GTF file is inclusive while the end coordinate in Hail is exclusive. Example ------- >>> ht = hl.experimental.import_gtf('data/test.gtf', key='gene_id') >>> ht.describe() .. code-block:: text ---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'seqname': str 'source': str 'feature': str 'start': int32 'end': int32 'score': float64 'strand': str 'frame': int32 'havana_gene': str 'exon_id': str 'havana_transcript': str 'transcript_name': str 'gene_type': str 'tag': str 'transcript_status': str 'exon_number': str 'level': str 'transcript_id': str 'transcript_type': str 'gene_id': str 'gene_name': str 'gene_status': str ---------------------------------------- Key: ['gene_id'] ---------------------------------------- Parameters ---------- path : :obj:`str` File to import. key : :obj:`str` or :obj:`list` of :obj:`str` Key field(s). Can be tag name(s) found in the attribute field of the GTF file. Returns ------- :class:`.Table` """ ht = hl.import_table(path, comment='#', no_header=True, types={'f3': hl.tint, 'f4': hl.tint, 'f5': hl.tfloat, 'f7': hl.tint}, missing='.', delimiter='\t') ht = ht.rename({'f0': 'seqname', 'f1': 'source', 'f2': 'feature', 'f3': 'start', 'f4': 'end', 'f5': 'score', 'f6': 'strand', 'f7': 'frame', 'f8': 'attribute'}) ht = ht.annotate(end=ht['end'] + 1) ht = ht.annotate(attribute=hl.dict( hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht['attribute'].split('; ')))) attributes = list(ht.aggregate( hl.set(hl.flatten(hl.agg.collect(ht['attribute'].keys()))))) ht = ht.annotate(**{x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x]) for x in attributes}) ht = ht.drop(ht['attribute']) if key: key = wrap_to_list(key) ht = ht.key_by(*key) return ht
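# A pure-Python sketch of the attribute parsing that import_gtf performs with
# Hail expressions (illustration only; `parse_gtf_attributes` is a hypothetical
# name). Note that Hail's StringExpression.replace takes a regex, so the ';$'
# pattern above strips only a trailing semicolon; re.sub plays that role here.
import re

def parse_gtf_attributes(attribute):
    # e.g. 'gene_id "ENSG00000223972"; gene_name "DDX11L1";'
    #   -> {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}
    return {field.split(' ')[0]:
            re.sub(';$', '', field.split(' ')[1].replace('"', ''))
            for field in attribute.split('; ')}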
def linreg(y, x): """Compute linear regression statistics. Examples -------- Regress HT against an intercept (1), SEX, and C1: >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1])) Struct( beta=[88.50000000000014, 81.50000000000057, -10.000000000000068], standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016], t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435], p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281], n=4) Regress blood pressure against an intercept (1), age, height, and height squared: >>> ds_ann = ds.annotate_rows( ... linreg=hl.agg.linreg(ds.pheno.blood_pressure, ... [1, ds.pheno.age, ds.pheno.height, ds.pheno.height ** 2])) Notes ----- This aggregator returns a struct expression with five fields: - `beta` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated regression coefficient for each predictor. - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated standard error for each predictor. - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`): t statistic for each predictor. - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`): p-value for each predictor. - `n` (:py:data:`.tint64`): Number of samples included in the regression. A sample is included if and only if `y` and all elements of `x` are non-missing. The first four fields are missing if n is less than or equal to the number of predictors or if the predictors are linearly dependent. Parameters ---------- y : :class:`.Float64Expression` Response variable. x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` Independent variables. Returns ------- :class:`.StructExpression` Struct with fields `beta`, `standard_error`, `t_stat`, `p_value`, and `n`. """ x = wrap_to_list(x) k = len(x) if k == 0: raise ValueError("'linreg' requires at least one predictor in `x`") t = hl.tstruct(beta=hl.tarray(hl.tfloat64), standard_error=hl.tarray(hl.tfloat64), t_stat=hl.tarray(hl.tfloat64), p_value=hl.tarray(hl.tfloat64), n=hl.tint64) x = hl.array(x) k = hl.int32(k) return _agg_func('LinearRegression', y, t, [k], f=lambda expr: x)
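# A NumPy/SciPy sketch of the statistics this aggregator reports, for
# comparison (illustration only; the actual computation happens in the Hail
# backend, and `linreg_stats` is a hypothetical name). With n samples and k
# predictors (including the intercept), the t-statistics follow a t
# distribution with n - k degrees of freedom.
import numpy as np
from scipy import stats

def linreg_stats(y, X):
    y, X = np.asarray(y, float), np.asarray(X, float)
    n, k = X.shape
    beta = np.linalg.lstsq(X, y, rcond=None)[0]
    resid = y - X @ beta
    sigma2 = resid @ resid / (n - k)                       # residual variance
    se = np.sqrt(np.diag(sigma2 * np.linalg.inv(X.T @ X)))
    t = beta / se
    p = 2 * stats.t.sf(np.abs(t), df=n - k)                # two-sided p-values
    return dict(beta=beta, standard_error=se, t_stat=t, p_value=p, n=n)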
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given one or more sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al., 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = ld_score_all_phenos_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = ld_score_one_pheno_sumstats >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes above for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={ '__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0) }, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={ '__y': chi_sq_exprs[0], '__n': n_samples_exprs[0] }) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows( hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict( **{ '__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr }, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, ** {n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ ds.select( **{ '__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]]) }) for i, y in enumerate(ys) ] mts = [ ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=[ '__w_initial', '__w_initial_floor', '__x', '__x_floor' ]) for ht in hts ] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows( hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg(lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map(lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate(__step1_block=step1_block, __step2_block=hl.cond( ~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x) ]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max( hl.min(mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M ]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter( (mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta), hl.range(n_blocks))) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i],
mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter( mt.__in_step2, hl.agg.linreg( y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) ]) mt = mt.annotate_cols(__step2_h2=hl.max( hl.min(mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n) / M ]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg( lambda i: hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]), hl.range(n_blocks))) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean(mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknives mt = mt.annotate_entries( __step2_initial_w=1.0 / (mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols(__final_block_betas_bias_corrected=( n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected) ], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks ]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct(estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M / hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
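# A schematic NumPy sketch (illustration only; `combine_two_step` is a
# hypothetical name) of how the step 1 and step 2 jackknives are combined
# above: the factor c = sum(w*x) / sum(w*x**2) propagates uncertainty in the
# step 1 intercept into the step 2 slope blocks before bias correction,
# following the LD Score regression method of Bulik-Sullivan et al.
import numpy as np

def combine_two_step(slope_hat, intercept_hat, step1_block_intercepts,
                     step2_block_slopes, x, w, n_blocks):
    c = np.sum(w * x) / np.sum(w * x ** 2)
    final_blocks = step2_block_slopes - c * (step1_block_intercepts - intercept_hat)
    # delete-one-block pseudo-values around the full-data slope estimate
    pseudovalues = n_blocks * slope_hat - (n_blocks - 1) * final_blocks
    variance = ((pseudovalues ** 2).sum()
                - pseudovalues.sum() ** 2 / n_blocks) / (n_blocks - 1) / n_blocks
    return pseudovalues.mean(), variance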
def linreg(dataset, ys, x, covariates=[], root='linreg', block_size=16): """For each row, test a derived input variable for association with response variables using linear regression. Examples -------- >>> dataset_result = methods.linreg(dataset, [dataset.pheno.height], dataset.GT.num_alt_alleles(), ... covariates=[dataset.pheno.age, dataset.pheno.isFemale]) Warning ------- :meth:`linreg` considers the same set of columns (i.e., samples, points) for every response variable and row, namely those columns for which **all** response variables and covariates are defined. For each row, missing values of ``x`` are mean-imputed over these columns. Notes ----- With the default root, the following row-indexed fields are added. The indexing of the array fields corresponds to that of ``ys``. - **linreg.nCompleteSamples** (*Int32*) -- number of columns used - **linreg.AC** (*Float64*) -- sum of input values ``x`` - **linreg.ytx** (*Array[Float64]*) -- array of dot products of each response vector ``y`` with the input vector ``x`` - **linreg.beta** (*Array[Float64]*) -- array of fit effect coefficients of ``x``, :math:`\hat\\beta_1` below - **linreg.se** (*Array[Float64]*) -- array of estimated standard errors, :math:`\widehat{\mathrm{se}}_1` - **linreg.tstat** (*Array[Float64]*) -- array of :math:`t`-statistics, equal to :math:`\hat\\beta_1 / \widehat{\mathrm{se}}_1` - **linreg.pval** (*Array[Float64]*) -- array of :math:`p`-values In the statistical genetics example above, the input variable ``x`` encodes genotype as the number of alternate alleles (0, 1, or 2). For each variant (row), genotype is tested for association with height controlling for age and sex, by fitting the linear regression model: .. math:: \mathrm{height} = \\beta_0 + \\beta_1 \, \mathrm{genotype} + \\beta_2 \, \mathrm{age} + \\beta_3 \, \mathrm{isFemale} + \\varepsilon, \quad \\varepsilon \sim \mathrm{N}(0, \sigma^2) Boolean covariates like :math:`\mathrm{isFemale}` are encoded as 1 for true and 0 for false. The null model sets :math:`\\beta_1 = 0`. The standard least-squares linear regression model is derived in Section 3.2 of `The Elements of Statistical Learning, 2nd Edition <http://statweb.stanford.edu/~tibs/ElemStatLearn/printings/ESLII_print10.pdf>`__. See equation 3.12 for the t-statistic, which follows the t-distribution with :math:`n - k - 2` degrees of freedom, under the null hypothesis of no effect, with :math:`n` samples and :math:`k` covariates in addition to ``x`` and the intercept. Parameters ---------- dataset : :class:`.MatrixTable` Dataset. ys : :obj:`list` of :class:`hail.expr.expression.Expression` One or more response expressions. x : :class:`hail.expr.expression.Expression` Input variable. covariates : :obj:`list` of :class:`hail.expr.expression.Expression` Covariate expressions. root : :obj:`str` Name of resulting row-indexed field. block_size : :obj:`int` Number of row regressions to perform simultaneously per core. Larger blocks require more memory but may improve performance. Returns ------- :class:`.MatrixTable` Dataset with regression results in a new row-indexed field. 
""" all_exprs = [x] ys = wrap_to_list(ys) # x is entry-indexed analyze('linreg/x', x, dataset._entry_indices) # ys and covariates are col-indexed ys = wrap_to_list(ys) for e in ys: all_exprs.append(e) analyze('linreg/ys', e, dataset._col_indices) for e in covariates: all_exprs.append(e) analyze('linreg/covariates', e, dataset._col_indices) base, cleanup = dataset._process_joins(*all_exprs) jm = base._jvds.linreg( jarray(Env.jvm().java.lang.String, [y._ast.to_hql() for y in ys]), x._ast.to_hql(), jarray(Env.jvm().java.lang.String, [cov._ast.to_hql() for cov in covariates]), 'va.`{}`'.format(root), block_size) return cleanup(MatrixTable(jm))