def set_female_y_metrics_to_na_expr(
    t: Union[hl.Table, hl.MatrixTable]
) -> hl.expr.ArrayExpression:
    """
    Set Y-variant frequency callstats for female-specific metrics to missing structs.

    .. note:: Requires freq, freq_meta, and freq_index_dict annotations to be present in Table or MatrixTable

    :param t: Table or MatrixTable for which to adjust female metrics
    :return: Hail array expression to set female Y-variant metrics to missing values
    """
    female_idx = hl.map(
        lambda x: t.freq_index_dict[x],
        hl.filter(lambda x: x.contains("XX"), t.freq_index_dict.keys()),
    )
    freq_idx_range = hl.range(hl.len(t.freq_meta))

    new_freq_expr = hl.if_else(
        (t.locus.in_y_nonpar() | t.locus.in_y_par()),
        hl.map(
            lambda x: hl.if_else(
                female_idx.contains(x), missing_callstats_expr(), t.freq[x]
            ),
            freq_idx_range,
        ),
        t.freq,
    )

    return new_freq_expr
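
# Hedged usage sketch for set_female_y_metrics_to_na_expr: the path below is
# hypothetical, and the Table it names is assumed to already carry the
# required `freq`, `freq_meta`, and `freq_index_dict` annotations.
import hail as hl

ht = hl.read_table('gs://my-bucket/release.ht')  # hypothetical path
ht = ht.annotate(freq=set_female_y_metrics_to_na_expr(ht))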
def test_ndarray_map():
    a = hl._ndarray([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq(
        (b, [[-2, -3, -4], [-5, -6, -7]]),
        (c, [[True, True, True], [True, True, True]]))
def test_ndarray_map():
    a = hl.nd.array([[2, 3, 4], [5, 6, 7]])
    b = hl.map(lambda x: -x, a)
    c = hl.map(lambda x: True, a)

    assert_ndarrays_eq(
        (b, [[-2, -3, -4], [-5, -6, -7]]),
        (c, [[True, True, True], [True, True, True]]))

    assert hl.eval(hl.null(hl.tndarray(hl.tfloat, 1)).map(lambda x: x * 2)) is None
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.expr.StructExpression:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.null('tbool')))  # fixed: was `hl.is_missing('tbool')`, which always evaluates to False
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
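
# Hedged usage sketch for impute_sex_aggregator: the simulated dataset and
# the use of variant_qc allele frequencies are illustrative only. Note that
# balding_nichols_model loci are autosomal, so the X-contig filter inside the
# aggregator would leave the result missing; a real call would use a dataset
# containing X-chromosome variants.
import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=10, n_variants=100)
mt = hl.variant_qc(mt)
sex_ht = mt.select_cols(
    imputed_sex=impute_sex_aggregator(mt.GT, mt.variant_qc.AF[1])).cols()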
def match_variants(gwas, reference):
    """
    Groups `gwas` and `reference` by the row key (e.g. locus) and then compares
    the allele fields to determine whether a strand flip has occurred.

    Assumes `gwas` row key and `reference` row key are the same type.
    Assumes both `gwas` and `reference` have a row field `alleles` of type
    array<str>. `alleles` cannot be a row key of the dataset.
    """
    def has_field_of_type(source, name, dtype):
        return name in source.row and source[name].dtype == dtype

    if not has_field_of_type(gwas, 'alleles', hl.tarray(hl.tstr)):
        raise TypeError(
            "'gwas' must have a row field 'alleles' with type array<str>")
    if not has_field_of_type(reference, 'alleles', hl.tarray(hl.tstr)):
        raise TypeError(
            "'reference' must have a row field 'alleles' with type array<str>")
    if 'alleles' in gwas.key:
        raise TypeError("'alleles' cannot be a row key in 'gwas'.")
    if 'alleles' in reference.key:
        raise TypeError("'alleles' cannot be a row key in 'reference'.")

    reference = reference.collect_by_key()
    matched = gwas.annotate(matches=hl.map(
        lambda x: x.annotate(match_alleles=add_strand_flip_annotation(
            x.alleles[0], x.alleles[1], gwas.alleles[0], gwas.alleles[1])),
        reference[gwas.key].values))
    return matched
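
# Hedged usage sketch for match_variants: both paths are hypothetical; each
# table must be keyed by the same type (e.g. locus) and carry an unkeyed
# `alleles` row field, as the checks above enforce.
import hail as hl

gwas_ht = hl.read_table('gs://my-bucket/gwas.ht')            # hypothetical path
reference_ht = hl.read_table('gs://my-bucket/reference.ht')  # hypothetical path
matched_ht = match_variants(gwas_ht, reference_ht)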
def _dumps_partitions(partitions, row_key_type):
    parts_type = partitions.dtype
    if not (isinstance(parts_type, hl.tarray)
            and isinstance(parts_type.element_type, hl.tinterval)):
        raise ValueError(
            f'partitions type invalid: {parts_type} must be array of intervals')

    point_type = parts_type.element_type.point_type
    f1, t1 = next(iter(row_key_type.items()))
    if point_type == t1:
        partitions = hl.map(
            lambda x: hl.interval(start=hl.struct(**{f1: x.start}),
                                  end=hl.struct(**{f1: x.end}),
                                  includes_start=x.includes_start,
                                  includes_end=x.includes_end),
            partitions)
    else:
        if not isinstance(point_type, hl.tstruct):
            raise ValueError(
                f'partitions has wrong type: {point_type} must be struct or type of first row key field')
        if not point_type._is_prefix_of(row_key_type):
            raise ValueError(
                f'partitions type invalid: {point_type} must be prefix of {row_key_type}')

    s = json.dumps(partitions.dtype._convert_to_json(hl.eval(partitions)))
    return s, partitions.dtype
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.StructExpression:
    """
    Calculates popmax (the highest frequency among the specified populations).

    :param ArrayExpression freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Struct with the popmax frequency data and its population
    :rtype: StructExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj') &
        pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0), freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
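
# Hedged usage sketch for add_popmax_expr: the path is hypothetical, the Table
# is assumed to carry gnomAD-style `freq`/`freq_meta` annotations, and the
# population set is illustrative.
import hail as hl

ht = hl.read_table('gs://my-bucket/freq.ht')  # hypothetical path
POPS = {'afr', 'amr', 'eas', 'nfe', 'sas'}    # illustrative population labels
ht = ht.annotate(popmax=add_popmax_expr(ht.freq, ht.freq_meta, populations=POPS))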
def pre_process_subset_freq(subset: str,
                            global_ht: hl.Table,
                            test: bool = False) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for
    loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset
        `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """
    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)

        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")])

    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset,
                    subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)

    else:
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. "
            "You may need to run the script generate_freq_data.py to generate "
            "frequency annotations first.")

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(freq=hl.if_else(
        hl.is_missing(ht.freq),
        hl.map(lambda x: missing_callstats_expr(),
               hl.range(hl.len(ht.freq_meta))),
        ht.freq,
    ))

    return ht
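
# Hedged usage sketch for pre_process_subset_freq: the subset ID is
# illustrative and `global_ht` stands in for the release-wide frequency Table
# that generate_freq_data.py would have produced (hypothetical path).
import hail as hl

global_ht = hl.read_table('gs://my-bucket/release_freq.ht')  # hypothetical path
subset_freq_ht = pre_process_subset_freq('non_ukb', global_ht, test=True)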
def _coerce(self, x: Expression):
    assert isinstance(x, hl.expr.DictExpression)
    if not self.kc._requires_conversion(x.dtype.key_type):
        # fast path
        return x.map_values(self.vc.coerce)
    else:
        return hl.dict(
            hl.map(lambda e: (self.kc.coerce(e[0]), self.vc.coerce(e[1])),
                   hl.array(x)))
def gt_to_gp(mt, location: str = 'GP'):
    # One-hot encode hard genotype calls as genotype probabilities (GP):
    # probability 1.0 for the called genotype index, 0.0 elsewhere.
    return mt.annotate_entries(
        **{
            location: hl.or_missing(
                hl.is_defined(mt.GT),
                hl.map(
                    lambda i: hl.cond(mt.GT.unphased_diploid_gt_index() == i,
                                      1.0, 0.0),
                    hl.range(0, hl.triangle(hl.len(mt.alleles)))))
        })
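
# Hedged usage sketch for gt_to_gp on a small simulated dataset (the simulated
# input is illustrative only, not from the original pipeline):
import hail as hl

demo_mt = hl.balding_nichols_model(n_populations=1, n_samples=4, n_variants=10)
demo_mt = gt_to_gp(demo_mt)
demo_mt.GP.show()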
def densify(sparse_mt):
    """Convert sparse matrix table to a dense VCF-like representation by
    expanding reference blocks.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.

    While computationally expensive, this operation is necessary for many
    downstream analyses, and should be thought of as roughly costing as much
    as reading a matrix table created by importing a dense project VCF.
    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(
            sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError(
            "first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError(
            "'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    contigs = sparse_mt.locus.dtype.reference_genome.contigs
    contig_idx_map = hl.literal({contigs[i]: i for i in range(len(contigs))},
                                'dict<str, int32>')
    mt = sparse_mt.annotate_rows(
        __contig_idx=contig_idx_map[sparse_mt.locus.contig])
    mt = mt.annotate_entries(__contig=mt.__contig_idx)

    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan._prev_nonnull(
                hl.or_missing(hl.is_defined(entry.END), entry)),
            t.__entries),
        lambda prev_entries: hl.map(
            lambda i: hl.rbind(
                prev_entries[i], t.__entries[i],
                lambda prev_entry, entry: hl.cond(
                    (~hl.is_defined(entry)
                     & (prev_entry.END >= t.locus.position)
                     & (prev_entry.__contig == t.__contig_idx)),
                    prev_entry,
                    entry)),
            hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig_idx', '__contig', 'END')
    return mt
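
# Hedged usage sketch for densify: the path is hypothetical and names a
# sparse, reference-blocked MatrixTable with a `locus` row key and an `END`
# entry field, as the checks above require.
import hail as hl

sparse_mt = hl.read_matrix_table('gs://my-bucket/sparse.mt')  # hypothetical path
dense_mt = densify(sparse_mt)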
def post_process_gene_map_ht(gene_ht):
    groups = [
        'pLoF', 'missense|LC', 'pLoF|missense|LC', 'synonymous', 'missense'
    ]
    variant_groups = hl.map(
        lambda group: group.split('\\|').flatmap(
            lambda csq: gene_ht.variants.get(csq)), groups)
    gene_ht = gene_ht.transmute(variant_groups=hl.zip(
        groups, variant_groups)).explode('variant_groups')
    gene_ht = gene_ht.transmute(annotation=gene_ht.variant_groups[0],
                                variants=hl.sorted(gene_ht.variant_groups[1]))
    gene_ht = gene_ht.key_by(start=gene_ht.interval.start)
    return gene_ht.filter(hl.len(gene_ht.variants) > 0)
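
# Hedged usage sketch for post_process_gene_map_ht: the path is hypothetical;
# the Table is assumed to carry an `interval` row field and a `variants` dict
# keyed by consequence category, which is what the function above expects.
import hail as hl

gene_ht = hl.read_table('gs://my-bucket/gene_map.ht')  # hypothetical path
gene_ht = post_process_gene_map_ht(gene_ht)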
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    """
    Loads the SNV and indel source VCFs into a single MatrixTable and returns it.

    :param splice_ai_snvs_path: SNV source location
    :param splice_ai_indels_path: indel source location
    :param genome_version: "37" or "38"
    :return: MatrixTable
    """
    logger.info("==> reading in splice_ai vcfs: %s, %s" %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37 the interval extends to MT; for GRCh38, MT is not included.
    interval = "1-MT" if genome_version == "37" else "chr1-chrY"
    contig_dict = None
    if genome_version == "38":
        contig_dict = NO_CHR_TO_CHR_CONTIG_RECODING

    mt = hl.import_vcf(
        [splice_ai_snvs_path, splice_ai_indels_path],
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_dict,
        force_bgz=True,
        min_partitions=10000,
        skip_invalid_loci=True,
    )
    interval = [
        hl.parse_locus_interval(interval,
                                reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Split the SpliceAI field by the | delimiter. Capture the delta score
    # entries and map them to floats.
    delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6]
    splice_split = mt.info.annotate(
        SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores))
    mt = mt.annotate_rows(info=splice_split)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    # The delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL.
    consequences = hl.literal(
        ["Acceptor gain", "Acceptor loss", "Donor gain", "Donor loss"])
    mt = mt.annotate_rows(info=mt.info.annotate(
        max_DS=hl.max(mt.info.SpliceAI)))
    mt = mt.annotate_rows(info=mt.info.annotate(splice_consequence=hl.if_else(
        mt.info.max_DS > 0,
        consequences[mt.info.SpliceAI.index(mt.info.max_DS)],
        "No consequence",
    )))
    return mt
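
# Hedged usage sketch for vcf_to_mt (both SpliceAI VCF paths are hypothetical):
splice_mt = vcf_to_mt(
    'gs://my-bucket/spliceai.snvs.vcf.bgz',    # hypothetical path
    'gs://my-bucket/spliceai.indels.vcf.bgz',  # hypothetical path
    genome_version='38',
)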
def get_expr_for_vep_all_consequences_array(vep_root):
    retval = vep_root.annotate(formattedIntergenic=hl.map(
        lambda x: hl.struct(
            allele_num=x.allele_num,
            amino_acids=hl.null(hl.tstr),
            biotype=hl.literal("intron"),
            canonical=hl.null(hl.tint32),
            ccds=hl.null(hl.tstr),
            cdna_start=hl.null(hl.tint32),
            cdna_end=hl.null(hl.tint32),
            cds_end=hl.null(hl.tint32),
            cds_start=hl.null(hl.tint32),
            codons=hl.null(hl.tstr),
            consequence_terms=x.consequence_terms,
            distance=hl.null(hl.tint32),  # not na...
            domains=hl.null(hl.tarray(hl.tstruct(db=hl.tstr, name=hl.tstr))),
            exon=hl.null(hl.tstr),
            gene_id=hl.null(hl.tstr),
            gene_pheno=hl.null(hl.tint32),  # not na...
            gene_symbol=hl.null(hl.tstr),
            gene_symbol_source=hl.null(hl.tstr),
            hgnc_id=hl.null(hl.tstr),
            hgvsc=hl.null(hl.tstr),
            hgvsp=hl.null(hl.tstr),
            hgvs_offset=hl.null(hl.tint32),
            impact=x.impact,
            intron=hl.null(hl.tstr),
            lof=hl.null(hl.tstr),
            lof_flags=hl.null(hl.tstr),
            lof_filter=hl.null(hl.tstr),
            lof_info=hl.null(hl.tstr),
            minimised=x.minimised,
            polyphen_prediction=hl.null(hl.tstr),
            polyphen_score=hl.null(hl.tfloat64),
            protein_end=hl.null(hl.tint32),
            protein_start=hl.null(hl.tint32),
            protein_id=hl.null(hl.tstr),
            sift_prediction=hl.null(hl.tstr),
            sift_score=hl.null(hl.tfloat64),
            strand=hl.null(hl.tint32),
            swissprot=hl.null(hl.tstr),
            transcript_id=hl.null(hl.tstr),
            trembl=hl.null(hl.tstr),
            uniparc=hl.null(hl.tstr),
            variant_allele=x.variant_allele),
        vep_root.intergenic_consequences))
    vep_root = retval.annotate(all_consequences=hl.cond(
        hl.is_missing(retval.transcript_consequences),
        retval.formattedIntergenic,
        retval.transcript_consequences))
    return vep_root
def densify(sparse_mt):
    """Convert sparse MatrixTable to a dense one.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.
    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError("first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError("'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    mt = sparse_mt
    mt = sparse_mt.annotate_entries(__contig=mt.locus.contig)
    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan._prev_nonnull(
                hl.or_missing(hl.is_defined(entry.END), entry)),
            t.__entries),
        lambda prev_entries: hl.map(
            lambda i: hl.rbind(
                prev_entries[i], t.__entries[i],
                lambda prev_entry, entry: hl.cond(
                    (~hl.is_defined(entry)
                     & (prev_entry.END >= t.locus.position)
                     & (prev_entry.__contig == t.locus.contig)),
                    prev_entry,
                    entry)),
            hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig', 'END')
    return mt
def vstack(arrs):
    """
    Stack arrays in sequence vertically (row wise).
    1-D arrays of shape `(N,)` will be reshaped to `(1,N)` before
    concatenation. For all other arrays, equivalent to :func:`.concatenate`
    with axis=0.

    Parameters
    ----------
    arrs : sequence of :class:`.NDArrayExpression`
        The arrays must have the same shape along all but the first axis.
        1-D arrays must have the same length.

    Returns
    -------
    stacked : :class:`.NDArrayExpression`
        The array formed by stacking the given arrays, will be at least 2-D.

    See Also
    --------
    :func:`.concatenate` : Join a sequence of arrays along an existing axis.

    Examples
    --------

    >>> a = hl.nd.array([1, 2, 3])
    >>> b = hl.nd.array([2, 3, 4])
    >>> hl.eval(hl.nd.vstack((a,b)))
    array([[1, 2, 3],
           [2, 3, 4]], dtype=int32)

    >>> a = hl.nd.array([[1], [2], [3]])
    >>> b = hl.nd.array([[2], [3], [4]])
    >>> hl.eval(hl.nd.vstack((a,b)))
    array([[1],
           [2],
           [3],
           [2],
           [3],
           [4]], dtype=int32)
    """
    head_ndim = arrs[0].ndim

    if head_ndim == 1:
        return concatenate(hl.map(lambda a: a._broadcast(2), arrs), 0)

    return concatenate(arrs, 0)
def all_and_leave_one_out(x,
                          pop_array,
                          all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of
    leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each of leave-one-out populations. It takes an index of leave-one-out population and the input array, and returns an array of new values.
    ...
    :return: Array of new values for all populations and for each of leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
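
# Hedged sanity-check sketch for all_and_leave_one_out, evaluated on literals:
# with the default all_f/loo_f it returns
# [sum(x), sum(x) - x[0], sum(x) - x[1], sum(x) - x[2]].
import hail as hl

x = hl.array([1.0, 2.0, 3.0])
pops = hl.array(['afr', 'eas', 'nfe'])  # illustrative population labels
print(hl.eval(all_and_leave_one_out(x, pops)))  # [6.0, 5.0, 4.0, 3.0]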
def match_variants(gwas_row_key, gwas_alleles, reference_row_key,
                   reference_alleles):
    """
    Assumes gwas row key and reference row key are the same type and are
    already keys of their datasets.
    """
    # FIXME: Allow locus to be matched with tstruct(contig, pos)
    assert (gwas_row_key.dtype == reference_row_key.dtype)
    assert (gwas_row_key._indices == gwas_alleles._indices)
    assert (reference_row_key._indices == reference_alleles._indices)
    # FIXME: assert all row indices

    reference_alleles_fn = reference_alleles._ast.name

    gwas = gwas_row_key._indices.source
    reference = reference_row_key._indices.source
    reference = reference.collect_by_key()

    matched_t = gwas.annotate(matches=hl.map(
        lambda x: x.annotate(match_alleles=add_strand_flip_annotation(
            x[reference_alleles_fn][0], x[reference_alleles_fn][1],
            gwas_alleles[0], gwas_alleles[1])),
        reference[gwas_row_key].values))

    # t = matched_t.annotate(matches=hl.map(lambda x: x.annotate(flip=add_strand_flip_annotation(x[???], x[???], matched_t[???], matched_t[???])), matched_t.matches))
    # t = t.select(t.locus, t.A1, t.A2, matches=hl.map(lambda x: x, t.matches))
    # Table(gwas_row_key,
    #       matches[struct{
    #           ref_index = int,
    #           ref_locus = locus,
    #           ref_rsid = str,
    #           ref_alleles = (ref,alt),
    #           allele_swap = bool,
    #           strand_flip = bool
    #       }, ...
    #       ]
    # )

    return matched_t
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Builds and returns a matrix where columns correspond to trios and
    entries contain genotypes for the trio.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----
    This method builds a new matrix table with one column per trio. If
    `complete_trios` is ``True``, then only trios that satisfy
    :meth:`.Trio.is_complete` are included. In this new dataset, the column
    identifiers are the sample IDs of the trio probands. The column fields and
    entries of the matrix are changed in the following ways:

    The new column fields consist of three structs (`proband`, `father`,
    `mother`), a sample ID field, a Boolean field, and a family ID field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
    pedigree : :class:`.Pedigree`
    complete_trios : :obj:`bool`
        If ``True``, only include trios where all three members are present.

    Returns
    -------
    :class:`.MatrixTable`
    """
    mt = dataset
    require_col_key_str(mt, "trio_matrix")

    k = mt.col_key.dtype.fields[0]
    samples = mt[k].collect()

    pedigree = pedigree.filter_to(samples)
    trios = pedigree.complete_trios() if complete_trios else pedigree.trios
    n_trios = len(trios)

    sample_idx = {}
    for i, s in enumerate(samples):
        sample_idx[s] = i

    trios = [
        hl.Struct(id=sample_idx[t.s],
                  pat_id=None if t.pat_id is None else sample_idx[t.pat_id],
                  mat_id=None if t.mat_id is None else sample_idx[t.mat_id],
                  is_female=t.is_female,
                  fam_id=t.fam_id) for t in trios
    ]
    trios_type = hl.dtype(
        'array<struct{id:int,pat_id:int,mat_id:int,is_female:bool,fam_id:str}>')

    trios_sym = Env.get_uid()
    entries_sym = Env.get_uid()
    cols_sym = Env.get_uid()

    mt = mt.annotate_globals(**{trios_sym: hl.literal(trios, trios_type)})
    mt = mt._localize_entries(entries_sym, cols_sym)
    mt = mt.annotate_globals(
        **{
            cols_sym: hl.map(
                lambda i: hl.bind(
                    lambda t: hl.struct(id=mt[cols_sym][t.id][k],
                                        proband=mt[cols_sym][t.id],
                                        father=mt[cols_sym][t.pat_id],
                                        mother=mt[cols_sym][t.mat_id],
                                        is_female=t.is_female,
                                        fam_id=t.fam_id),
                    mt[trios_sym][i]),
                hl.range(0, n_trios))
        })
    mt = mt.annotate(
        **{
            entries_sym: hl.map(
                lambda i: hl.bind(
                    lambda t: hl.struct(
                        proband_entry=mt[entries_sym][t.id],
                        father_entry=mt[entries_sym][t.pat_id],
                        mother_entry=mt[entries_sym][t.mat_id]),
                    mt[trios_sym][i]),
                hl.range(0, n_trios))
        })
    mt = mt.drop(trios_sym)

    return mt._unlocalize_entries(entries_sym, cols_sym, ['id'])
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(
        unnorm_beta=mt.summary_stats.BETA / (mt.summary_stats.SE**2),
        inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(
        META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
        META_SE=hl.map(lambda x: hl.sqrt(1 / x), mt.sum_inv_se2))
    mt = mt.annotate_entries(META_Pvalue=hl.map(
        lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA / mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(
            lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
            mt.META_BETA),
        variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                              mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N,
                           mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops',
        'AF_Allele2', 'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = ld_score_all_phenos_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])

    Run the method on a table with summary statistics for a single phenotype:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple phenotypes:

    >>> ht_gwas = ld_score_one_pheno_sumstats
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                   ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to apply
       to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the
       expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are
       matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants (rows) by
       phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields ``locus`` of
       type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field of type
       :py:data:`.tstr` that uniquely identifies phenotypes represented in
       the matrix table. The column key must be a single expression;
       compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field (not a list of
       fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field (not a list
       of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with
    missing values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used to derive
        variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used as covariates
        in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions for chi-squared statistics resulting from genome-wide
        association studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions indicating the number of samples used in the studies that
        generated the test statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
        The number of blocks used in the jackknife approach to estimating
        standard errors.
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit the
        model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability
        :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs))
            or (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={
            '__locus': ds.locus,
            '__alleles': ds.alleles,
            '__w_initial': weight_expr,
            '__w_initial_floor': hl.max(weight_expr, 1.0),
            '__x': ld_score_expr,
            '__x_floor': hl.max(ld_score_expr, 1.0)
        },
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={
                                '__y': chi_sq_exprs[0],
                                '__n': n_samples_exprs[0]
                            })
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(
            hl.is_defined(ds.__locus)
            & hl.is_defined(ds.__alleles)
            & hl.is_defined(ds.__w_initial)
            & hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y,
                    ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(
            **{
                '__locus': ds.locus,
                '__alleles': ds.alleles,
                '__w_initial': weight_expr,
                '__x': ld_score_expr
            },
            **{y: chi_sq_exprs[i] for i, y in enumerate(ys)},
            **{w: weight_expr for w in ws},
            **{n: n_samples_exprs[i] for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [
            ds.select(
                **{
                    '__w_initial': ds.__w_initial,
                    '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                    '__x': ds.__x,
                    '__x_floor': hl.max(ds.__x, 1.0),
                    '__y_name': i,
                    '__y': ds[ys[i]],
                    '__w': ds[ws[i]],
                    '__n': hl.int(ds[ns[i]])
                }) for i, y in enumerate(ys)
        ]

        mts = [
            ht.to_matrix_table(row_key=['__locus', '__alleles'],
                               col_key=['__y_name'],
                               row_fields=[
                                   '__w_initial', '__w_initial_floor', '__x',
                                   '__x_floor'
                               ]) for ht in hts
        ]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(
            hl.is_defined(ds.__locus)
            & hl.is_defined(ds.__alleles)
            & hl.is_defined(ds.__w_initial)
            & hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y)
                                         & (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))
    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(lambda s1: step1_idx >= s1,
                                   step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(
                                ~entry.__in_step1 & is_separator,
                                step1_block - 1,
                                step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)
    ])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1, 1.0 /
            (mt.__w_initial_floor * 2.0 *
             (mt.__step1_betas[0] +
              mt.__step1_betas[1] * mt.__x_floor)**2), 0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(
            hl.min(mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M
        ])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter(
            (mt.__step1_block != i) & mt.__in_step1,
            hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) - hl.sum(
                           hl.map(lambda x: x[i],
                                  mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) / (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2, 1.0 /
            (mt.__w_initial_floor * 2.0 *
             (mt.__step2_betas[0] +
              mt.__step2_betas[1] * mt.__x_floor)**2), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(
                mt.__in_step2,
                hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                              x=[mt.__x],
                              weight=mt.__w).beta[0])
        ])
        mt = mt.annotate_cols(__step2_h2=hl.max(
            hl.min(mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n) / M
        ])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=hl.agg.array_agg(
        lambda i: hl.agg.filter(
            (mt.__step2_block != i) & mt.__in_step2,
            hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                          x=[mt.__x],
                          weight=mt.__w).beta[0]),
        hl.range(n_blocks)))

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) /
        (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 /
        (mt.__w_initial_floor * 2.0 *
         (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[mt.__step1_betas[0], mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)
        ],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) /
            (n_blocks - 1) / n_blocks
        ])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt(
                (M / hl.agg.mean(mt.__n))**2 *
                mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
    'Siblings': 'sibling_ids',
    'Second Order': 'second_order_relationship_ids',
    'Third Order': 'third_order_relationship_ids',
    'Children': 'children_ids'
})

ht_relationships = ht_relationships.annotate(
    paternal_id=hl.or_missing(ht_relationships['paternal_id'] != '0',
                              ht_relationships['paternal_id']),
    maternal_id=hl.or_missing(ht_relationships['maternal_id'] != '0',
                              ht_relationships['maternal_id']),
    relationship_role=hl.cond(
        ht_relationships['relationship_role'] == 'unrel', 'unrelated',
        ht_relationships['relationship_role']),
    # Note: the list-valued fields below originally used `== '0'`, which would
    # keep the ids only when the raw value is '0'; flipped to `!= '0'` to match
    # the paternal_id/maternal_id handling above, where '0' denotes "none".
    sibling_ids=hl.or_missing(
        ht_relationships['sibling_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['sibling_ids'].split(','))),
    children_ids=hl.or_missing(
        ht_relationships['children_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['children_ids'].split(','))),
    second_order_relationship_ids=hl.or_missing(
        ht_relationships['second_order_relationship_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['second_order_relationship_ids'].split(','))),
    third_order_relationship_ids=hl.or_missing(
        ht_relationships['third_order_relationship_ids'] != '0',
        hl.map(lambda x: x.strip(),
               ht_relationships['third_order_relationship_ids'].split(','))))
ht_relationships = ht_relationships.key_by('s')
ht_relationships = ht_relationships.select('family_id', 'relationship_role',
                                           'maternal_id', 'paternal_id',
def import_gtf(path, key=None):
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will include the following
    row fields:

    .. code-block:: text

        'seqname': str
        'source': str
        'feature': str
        'start': int32
        'end': int32
        'score': float64
        'strand': str
        'frame': int32

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    .. note::

        The "end" field in the table will be incremented by 1 in
        comparison to the value found in the GTF file, as the end
        coordinate in a GTF file is inclusive while the end coordinate
        in Hail is exclusive.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf', key='gene_id')
    >>> ht.describe()

    .. code-block:: text

        ----------------------------------------
        Global fields:
        None
        ----------------------------------------
        Row fields:
            'seqname': str
            'source': str
            'feature': str
            'start': int32
            'end': int32
            'score': float64
            'strand': str
            'frame': int32
            'havana_gene': str
            'exon_id': str
            'havana_transcript': str
            'transcript_name': str
            'gene_type': str
            'tag': str
            'transcript_status': str
            'exon_number': str
            'level': str
            'transcript_id': str
            'transcript_type': str
            'gene_id': str
            'gene_name': str
            'gene_status': str
        ----------------------------------------
        Key: ['gene_id']
        ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    key : :obj:`str` or :obj:`list` of :obj:`str`
        Key field(s). Can be tag name(s) found in the attribute field
        of the GTF file.

    Returns
    -------
    :class:`.Table`
    """
    ht = hl.import_table(path,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(end=ht['end'] + 1)
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = list(ht.aggregate(
        hl.set(hl.flatten(hl.agg.collect(ht['attribute'].keys())))))

    ht = ht.annotate(**{x: hl.or_missing(ht['attribute'].contains(x),
                                         ht['attribute'][x])
                        for x in attributes})

    ht = ht.drop(ht['attribute'])

    if key:
        key = wrap_to_list(key)
        ht = ht.key_by(*key)

    return ht
def _coerce(self, x: Expression):
    assert isinstance(x, hl.expr.SetExpression)
    return hl.map(lambda x_: self.ec.coerce(x_), x)
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type
    :class:`.tinterval` constructed from the ``seqname``, ``start``, and
    ``end`` fields in the GTF file. This interval is inclusive of both the
    start and end positions in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(
                seqname=hl.case()
                .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                .when(ht['seqname'].startswith('chrHLA'),
                      ht['seqname'].replace('^chr', ''))
                .when(ht['seqname'].startswith('chr'), ht['seqname'])
                .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(
                set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
ht_transcripts = hl.import_table(
    'gs://hail-datasets/raw-data/gtex/v7/reference/gencode.v19.transcripts.patched_contigs.gtf',
    comment='#',
    no_header=True,
    types={'f3': hl.tint, 'f4': hl.tint},
    missing='.',
    min_partitions=12)

ht_transcripts = ht_transcripts.rename({
    'f0': 'contig',
    'f1': 'annotation_source',
    'f2': 'feature_type',
    'f3': 'start',
    'f4': 'end',
    'f5': 'score',
    'f6': 'strand',
    'f7': 'phase',
    'f8': 'attributes'
})

ht_transcripts = ht_transcripts.filter(
    ht_transcripts.feature_type == 'transcript')
ht_transcripts = ht_transcripts.annotate(
    interval=hl.interval(
        hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'),
        hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
ht_transcripts = ht_transcripts.annotate(attributes=hl.dict(
    hl.map(lambda x: (x.split(' ')[0],
                      x.split(' ')[1].replace('"', '').replace(';$', '')),
           ht_transcripts.attributes.split('; '))))
attribute_cols = list(ht_transcripts.aggregate(
    hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
ht_transcripts = ht_transcripts.annotate(
    **{x: hl.or_missing(ht_transcripts.attributes.contains(x),
                        ht_transcripts.attributes[x])
       for x in attribute_cols})
ht_transcripts = ht_transcripts.select(
    *['transcript_id', 'transcript_name', 'transcript_type', 'strand',
      'transcript_status', 'havana_transcript', 'ccdsid', 'ont', 'gene_name',
      'interval', 'gene_type', 'annotation_source', 'havana_gene',
      'gene_status', 'tag'])
ht_transcripts = ht_transcripts.rename({
    'havana_transcript': 'havana_transcript_id',
    'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

mt = hl.import_matrix_table(
    'gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
    row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
    row_key='transcript_id',
    missing='',
    entry_type=hl.tfloat)

mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)
mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)
        get_xpos(rows.info.CHR2, rows.info.END2),
        get_xpos(rows.locus.contig, rows.info.END)),
    'svType': lambda rows: rows.sv_type[0],
    'transcriptConsequenceTerms': lambda rows: [rows.sv_type[0]],
    'sv_type_detail': lambda rows: hl.if_else(
        rows.sv_type[0] == 'CPX', rows.info.CPX_TYPE,
        hl.if_else((rows.sv_type[0] == 'INS') & (hl.len(rows.sv_type) > 1),
                   rows.sv_type[1], hl.missing('str'))),
    'geneIds': lambda rows: hl.set(
        hl.map(
            lambda x: x.gene_id,
            rows.sortedTranscriptConsequences.filter(
                lambda x: x.predicted_consequence != 'NEAREST_TSS'))),
    'samples_no_call': lambda rows: get_sample_num_alt_x(rows, -1),
    'samples_num_alt_1': lambda rows: get_sample_num_alt_x(rows, 1),
    'samples_num_alt_2': lambda rows: get_sample_num_alt_x(rows, 2),
}

FIELDS = list(CORE_FIELDS.keys()) + list(DERIVED_FIELDS.keys()) + [
    'variantId', 'sortedTranscriptConsequences', 'genotypes'
]


def get_xpos(contig, pos):
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes above for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
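# --- Illustrative sketch (not part of the Hail implementation above) ---
# A minimal NumPy-only example of the core regression inside
# ld_score_regression: chi-squared statistics regressed on LD scores under
# E[chi_j^2] = 1 + N*a + (N * h_g^2 / M) * l_j. The function name below is
# hypothetical, and the two-step procedure, iterative reweighting, and
# block jackknife of the real method are all omitted for brevity.
import numpy as np


def fit_ld_score_regression_sketch(chi_sq, ld_scores, n_samples, m_variants):
    """Weighted least squares of chi-squared statistics on LD scores."""
    # Crude heteroskedasticity weights, analogous in spirit to the
    # __w_initial_floor / __x_floor weighting used above.
    w = 1.0 / (2.0 * np.maximum(ld_scores, 1.0) ** 2)
    X = np.column_stack([np.ones_like(ld_scores), ld_scores])
    beta = np.linalg.solve(X.T @ (X * w[:, None]), X.T @ (w * chi_sq))
    intercept, slope = beta
    return intercept, slope * m_variants / n_samples  # (1 + N*a, h_g^2)


# Synthetic data with no confounding (a = 0) and true h_g^2 = 0.3. The
# draws below only match the model's mean, not the exact non-central
# chi-squared distribution; this is enough to recover the parameters.
rng = np.random.default_rng(0)
M, N, h2_true = 50_000, 100_000, 0.3
l_j = rng.gamma(shape=4.0, scale=25.0, size=M)    # synthetic LD scores
chi_sq = (1.0 + (N * h2_true / M) * l_j) * rng.chisquare(df=1, size=M)
print(fit_ld_score_regression_sketch(chi_sq, l_j, N, M))  # ~ (1.0, 0.3)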
def export(self, path, delimiter='\t', missing='NA', header=True): """Export a field to a text file. Examples -------- >>> small_mt.GT.export('output/gt.tsv') >>> with open('output/gt.tsv', 'r') as f: ... for line in f: ... print(line, end='') locus alleles 0 1 2 3 1:1 ["A","C"] 0/1 0/1 0/0 0/0 1:2 ["A","C"] 1/1 0/1 1/1 1/1 1:3 ["A","C"] 1/1 0/1 0/1 0/0 1:4 ["A","C"] 1/1 0/1 1/1 1/1 <BLANKLINE> >>> small_mt.GT.export('output/gt-no-header.tsv', header=False) >>> with open('output/gt-no-header.tsv', 'r') as f: ... for line in f: ... print(line, end='') 1:1 ["A","C"] 0/1 0/1 0/0 0/0 1:2 ["A","C"] 1/1 0/1 1/1 1/1 1:3 ["A","C"] 1/1 0/1 0/1 0/0 1:4 ["A","C"] 1/1 0/1 1/1 1/1 <BLANKLINE> >>> small_mt.pop.export('output/pops.tsv') >>> with open('output/pops.tsv', 'r') as f: ... for line in f: ... print(line, end='') sample_idx pop 0 2 1 2 2 0 3 2 <BLANKLINE> >>> small_mt.ancestral_af.export('output/ancestral_af.tsv') >>> with open('output/ancestral_af.tsv', 'r') as f: ... for line in f: ... print(line, end='') locus alleles ancestral_af 1:1 ["A","C"] 5.3905e-01 1:2 ["A","C"] 8.6768e-01 1:3 ["A","C"] 4.3765e-01 1:4 ["A","C"] 7.6300e-01 <BLANKLINE> >>> small_mt.bn.export('output/bn.tsv') >>> with open('output/bn.tsv', 'r') as f: ... for line in f: ... print(line, end='') bn {"n_populations":3,"n_samples":4,"n_variants":4,"n_partitions":8,"pop_dist":[1,1,1],"fst":[0.1,0.1,0.1],"mixture":false} <BLANKLINE> Notes ----- For entry-indexed expressions, if there is one column key field, the result of calling :func:`~hail.expr.functions.str` on that field is used as the column header. Otherwise, each compound column key is converted to JSON and used as a column header. For example: >>> small_mt = small_mt.key_cols_by(s=small_mt.sample_idx, family='fam1') >>> small_mt.GT.export('output/gt-compound-key.tsv') >>> with open('output/gt-compound-key.tsv', 'r') as f: ... for line in f: ... print(line, end='') locus alleles {"s":0,"family":"fam1"} {"s":1,"family":"fam1"} {"s":2,"family":"fam1"} {"s":3,"family":"fam1"} 1:1 ["A","C"] 0/1 0/1 0/0 0/0 1:2 ["A","C"] 1/1 0/1 1/1 1/1 1:3 ["A","C"] 1/1 0/1 0/1 0/0 1:4 ["A","C"] 1/1 0/1 1/1 1/1 <BLANKLINE> Parameters ---------- path : :class:`str` The path to which to export. delimiter : :class:`str` The string for delimiting columns. missing : :class:`str` The string to output for missing values. header : :obj:`bool` When ``True``, include a header line. """ uid = Env.get_uid() self_name, ds = self._to_relational_preserving_rows_and_cols(uid) if isinstance(ds, hl.Table): ds.export(output=path, delimiter=delimiter, header=header) else: assert len(self._indices.axes) == 2 entries, cols = Env.get_uid(), Env.get_uid() t = ds.select_cols().localize_entries(entries, cols) t = t.order_by(*t.key) output_col_name = Env.get_uid() entry_array = t[entries] if self_name: entry_array = hl.map(lambda x: x[self_name], entry_array) entry_array = hl.map( lambda x: hl.if_else(hl.is_missing(x), missing, hl.str(x)), entry_array) file_contents = t.select( **{k: hl.str(t[k]) for k in ds.row_key}, **{output_col_name: hl.delimit(entry_array, delimiter)}) if header: col_key = t[cols] if len(ds.col_key) == 1: col_key = hl.map(lambda x: x[0], col_key) column_names = hl.map(hl.str, col_key).collect(_localize=False)[0] header_table = hl.utils.range_table(1).key_by().select( **{k: k for k in ds.row_key}, **{output_col_name: hl.delimit(column_names, delimiter)}) file_contents = header_table.union(file_contents) file_contents.export(path, delimiter=delimiter, header=False)
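# --- Illustrative sketch (assumed names, not part of the method above) ---
# export prepends the header by writing it as a one-row table and using
# Table.union with the rendered body, which is why the final export is
# called with header=False. A minimal sketch of that trick on a toy table;
# the table fields and output path here are made up for illustration:
import hail as hl

t = hl.utils.range_table(3)
t = t.select(col_a=t.idx, col_b=hl.str(t.idx * 2))
header = hl.utils.range_table(1).key_by().select(line='col_a\tcol_b')
body = t.key_by().select(line=hl.str(t.col_a) + '\t' + t.col_b)
# union concatenates the one-row header table ahead of the body rows:
header.union(body).export('output/with_header.tsv', header=False)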
def _coerce(self, x: Expression): assert isinstance(x, hl.expr.SetExpression) return hl.map(lambda x_: self.ec.coerce(x_), x)
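# A quick sanity check of the pattern above (illustrative, run manually):
# hl.map over a SetExpression returns a set with the function applied to
# each element, so coercing the element type preserves set semantics.
import hail as hl

print(hl.eval(hl.map(lambda x: hl.float64(x), hl.set([1, 2, 3]))))
# -> set {1.0, 2.0, 3.0}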
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table: """Import a GTF file. The GTF file format is identical to the GFF version 2 file format, and so this function can be used to import GFF version 2 files as well. See https://www.ensembl.org/info/website/upload/gff.html for more details on the GTF/GFF2 file format. The :class:`.Table` returned by this function will be keyed by the ``interval`` row field and will include the following row fields: .. code-block:: text 'source': str 'feature': str 'score': float64 'strand': str 'frame': int32 'interval': interval<> There will also be corresponding fields for every tag found in the attribute field of the GTF file. Note ---- This function will return an ``interval`` field of type :class:`.tinterval` constructed from the ``seqname``, ``start``, and ``end`` fields in the GTF file. This interval is inclusive of both the start and end positions in the GTF file. If the ``reference_genome`` parameter is specified, the start and end points of the ``interval`` field will be of type :class:`.tlocus`. Otherwise, the start and end points of the ``interval`` field will be of type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and ``position`` (type :class:`.tint32`). Furthermore, if the ``reference_genome`` parameter is specified and ``skip_invalid_contigs`` is ``True``, this import function will skip lines in the GTF where ``seqname`` is not consistent with the reference genome specified. Example ------- >>> ht = hl.experimental.import_gtf('data/test.gtf', ... reference_genome='GRCh37', ... skip_invalid_contigs=True) >>> ht.describe() # doctest: +NOTEST ---------------------------------------- Global fields: None ---------------------------------------- Row fields: 'source': str 'feature': str 'score': float64 'strand': str 'frame': int32 'gene_type': str 'exon_id': str 'havana_transcript': str 'level': str 'transcript_name': str 'gene_status': str 'gene_id': str 'transcript_type': str 'tag': str 'transcript_status': str 'gene_name': str 'transcript_id': str 'exon_number': str 'havana_gene': str 'interval': interval<locus<GRCh37>> ---------------------------------------- Key: ['interval'] ---------------------------------------- Parameters ---------- path : :obj:`str` File to import. reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional Reference genome to use. skip_invalid_contigs : :obj:`bool` If ``True`` and `reference_genome` is not ``None``, skip lines where ``seqname`` is not consistent with the reference genome. min_partitions : :obj:`int` or :obj:`None` Minimum number of partitions (passed to import_table). 
Returns ------- :class:`.Table` """ ht = hl.import_table(path, min_partitions=min_partitions, comment='#', no_header=True, types={'f3': hl.tint, 'f4': hl.tint, 'f5': hl.tfloat, 'f7': hl.tint}, missing='.', delimiter='\t') ht = ht.rename({'f0': 'seqname', 'f1': 'source', 'f2': 'feature', 'f3': 'start', 'f4': 'end', 'f5': 'score', 'f6': 'strand', 'f7': 'frame', 'f8': 'attribute'}) ht = ht.annotate(attribute=hl.dict( hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht['attribute'].split('; ')))) attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys())) ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x]) for x in attributes if x}) if reference_genome: if reference_genome == 'GRCh37': ht = ht.annotate(seqname=ht['seqname'].replace('^chr', '')) else: ht = ht.annotate(seqname=hl.case() .when(ht['seqname'].startswith('HLA'), ht['seqname']) .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', '')) .when(ht['seqname'].startswith('chr'), ht['seqname']) .default('chr' + ht['seqname'])) if skip_invalid_contigs: valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs)) ht = ht.filter(valid_contigs.contains(ht['seqname'])) ht = ht.transmute(interval=hl.locus_interval(ht['seqname'], ht['start'], ht['end'], includes_start=True, includes_end=True, reference_genome=reference_genome)) else: ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']), hl.struct(seqname=ht['seqname'], position=ht['end']), includes_start=True, includes_end=True)) ht = ht.key_by('interval') return ht
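# --- Illustrative sketch (not part of import_gtf) ---
# The attribute-column parsing above, replayed in plain Python for a single
# GTF line. Hail's StringExpression.replace is regex-based, so the ';$'
# pattern maps to re.sub here. The sample attribute string is made up.
import re

attribute = 'gene_id "ENSG00000223972"; gene_name "DDX11L1";'
parsed = {
    field.split(' ')[0]: re.sub(';$', '', field.split(' ')[1].replace('"', ''))
    for field in attribute.split('; ')
}
print(parsed)  # {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}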
'SMRRNANM', 'SMVQCFL', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMUNPDRD', 'SMMPPDUN', 'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SMSPLTRD', 'SME1ANTI', 'SME1SNSE', 'SMNUM5CD'] ht_samples = ht_samples.annotate(**{x: hl.float(ht_samples[x]) for x in float_cols}) ht_samples = ht_samples.annotate(**{x: hl.int(ht_samples[x].replace('.0$', '')) for x in int_cols}) ht = ht.filter(ht.feature_type == 'gene') ht = ht.annotate(interval=hl.interval(hl.locus(ht['contig'], ht['start'], 'GRCh37'), hl.locus(ht['contig'], ht['end'] + 1, 'GRCh37'))) ht = ht.annotate(attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0], x.split(' ')[1].replace('"', '').replace(';$', '')), ht['attributes'].split('; ')))) attribute_cols = list(ht.aggregate(hl.set(hl.flatten(hl.agg.collect(ht.attributes.keys()))))) ht = ht.annotate(**{x: hl.or_missing(ht.attributes.contains(x), ht.attributes[x]) for x in attribute_cols}) ht = ht.select(*(['gene_id', 'interval', 'gene_type', 'strand', 'annotation_source', 'havana_gene', 'gene_status', 'tag'])) ht = ht.rename({'havana_gene': 'havana_gene_id'}) ht = ht.key_by(ht.gene_id) """
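# --- Illustrative note on the `end + 1` in the fragment above ---
# GTF coordinates are inclusive of the end position, while hl.interval
# defaults to includes_end=False, so the fragment shifts the end by one to
# build an equivalent half-open interval. A small check of that behavior:
import hail as hl

iv = hl.interval(hl.locus('1', 100, 'GRCh37'), hl.locus('1', 201, 'GRCh37'))
print(hl.eval(iv.contains(hl.locus('1', 200, 'GRCh37'))))  # True
print(hl.eval(iv.contains(hl.locus('1', 201, 'GRCh37'))))  # False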