# Shared imports assumed by the excerpts below
import datetime
import datetime as dt
import subprocess
from typing import Dict, Optional

import hail as hl
import numpy as np


def get_group_to_counts_expr(k: hl.expr.StructExpression, counts: hl.expr.DictExpression) -> hl.expr.ArrayExpression:
    # Enumerate candidate (snv, all, csq) keys derived from `k`, keep only the
    # keys present in `counts`, and return their corresponding count values.
    return hl.range(1, k.snv - 1, step=-1).flatmap(
        lambda snv: hl.range(0, k.all + 1).flatmap(
            lambda af: hl.range(0, k.csq + 1).map(
                lambda csq: hl.struct(snv=hl.bool(snv), all=hl.bool(af), csq=csq)
            )
        )
    ).filter(
        lambda key: counts.contains(key)
    ).map(
        lambda key: counts[key]
    )
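
# A minimal usage sketch of the key enumeration above. The dict below is
# hypothetical and stands in for a real aggregation result keyed by
# (snv, all, csq) structs.
example_counts = hl.dict([
    (hl.struct(snv=True, all=True, csq=0), 10),
    (hl.struct(snv=True, all=False, csq=1), 7),
])
example_k = hl.struct(snv=hl.bool(True), all=hl.bool(True), csq=1)
# With snv=True, hl.range(1, k.snv - 1, step=-1) yields [1] only, so just SNV
# groups are enumerated; the af/csq ranges include all values up to k's own.
hl.eval(get_group_to_counts_expr(example_k, example_counts))  # [7, 10]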
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` and filters to individuals with the
    phenotype defined. Uses sex-specific IRNT phenotypes.
    `sex` options: female, male, both_sexes
    '''
    # `phen_dict` is a module-level mapping from phenotype code to description (defined elsewhere)
    print(f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...')
    phen_tb0 = phen_tb_dict[sex]
    phen_tb = phen_tb0.select(phen).rename({phen: 'phen'})
    # Dispatch to row- or column-wise methods depending on the input type
    if type(tb) == hl.table.Table:
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif type(tb) == hl.matrixtable.MatrixTable:
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols
    tb0 = annotate_fn(self=tb, phen_str=hl.str(phen_tb[tb.s]['phen']).replace('\"', ''))
    if filter_to_phen:  # filter to individuals with phenotype data defined
        tb1 = filter_fn(self=tb0, expr=tb0.phen_str == '', keep=False)
    else:
        tb1 = tb0
    if phen_tb.phen.dtype == hl.dtype('bool'):
        tb2 = annotate_fn(self=tb1, phen=hl.bool(tb1.phen_str)).drop('phen_str')
    else:
        tb2 = annotate_fn(self=tb1, phen=hl.float64(tb1.phen_str)).drop('phen_str')
    return tb2
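
# A short usage sketch, assuming hypothetical sex-keyed phenotype tables
# (the paths below are illustrative, not the pipeline's real inputs):
phen_tb_dict = {
    sex: hl.read_table(f'gs://some-bucket/phenotypes.{sex}.ht')  # illustrative path
    for sex in ['female', 'male', 'both_sexes']
}
mt = hl.read_matrix_table('gs://some-bucket/genotypes.mt')  # illustrative path
mt = annotate_phen(mt, phen='50', sex='female', phen_tb_dict=phen_tb_dict)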
def compute_qc_metrics_residuals(
    ht: hl.Table,
    pc_scores: hl.expr.ArrayNumericExpression,
    qc_metrics: Dict[str, hl.expr.NumericExpression],
    use_pc_square: bool = True,
    n_pcs: Optional[int] = None,
    regression_sample_inclusion_expr: hl.expr.BooleanExpression = hl.bool(True),
) -> hl.Table:
    """
    Compute QC metrics residuals after regressing out PCs (and optionally PC^2).

    .. note::

        The `regression_sample_inclusion_expr` can be used to select a subset of
        the samples to include in the regression calculation. Residuals are
        always computed for all samples.

    :param ht: Input sample QC metrics HT
    :param pc_scores: The expression in the input HT that stores the PC scores
    :param qc_metrics: A dictionary with the name of each QC metric to compute
        residuals for and their corresponding expression in the input HT.
    :param use_pc_square: Whether to use PC^2 in the regression or not
    :param n_pcs: Number of PCs to use. If not set, then all PCs in `pc_scores` are used.
    :param regression_sample_inclusion_expr: An optional expression to select
        samples to include in the regression calculation.
    :return: Table with QC metrics residuals
    """
    # Annotate QC HT with fields necessary for computation
    _sample_qc_ht = ht.select(**qc_metrics, scores=pc_scores, _keep=regression_sample_inclusion_expr)

    # If n_pcs wasn't provided, use all PCs
    if n_pcs is None:
        n_pcs = _sample_qc_ht.aggregate(hl.agg.min(hl.len(_sample_qc_ht.scores)))

    logger.info(
        "Computing regressed QC metrics filters using %d PCs for metrics: %s",
        n_pcs,
        ", ".join(qc_metrics),
    )

    # Prepare regression variables, adding 1.0 first for the intercept.
    # Adds the square of each variable if use_pc_square is true.
    x_expr = [1.0] + [_sample_qc_ht.scores[i] for i in range(0, n_pcs)]
    if use_pc_square:
        x_expr.extend(
            [_sample_qc_ht.scores[i] * _sample_qc_ht.scores[i] for i in range(0, n_pcs)])

    # Compute linear regressions
    lms = _sample_qc_ht.aggregate(
        hl.struct(
            **{
                metric: hl.agg.filter(
                    _sample_qc_ht._keep,
                    hl.agg.linreg(y=_sample_qc_ht[metric], x=x_expr),
                )
                for metric in qc_metrics
            }))

    _sample_qc_ht = _sample_qc_ht.annotate_globals(lms=lms).persist()

    # Compute residuals
    def get_lm_prediction_expr(metric: str):
        lm_pred_expr = _sample_qc_ht.lms[metric].beta[0] + hl.sum(
            hl.range(n_pcs).map(
                lambda i: _sample_qc_ht.lms[metric].beta[i + 1] * _sample_qc_ht.scores[i]))
        if use_pc_square:
            lm_pred_expr = lm_pred_expr + hl.sum(
                hl.range(n_pcs).map(
                    lambda i: _sample_qc_ht.lms[metric].beta[i + n_pcs + 1]
                    * _sample_qc_ht.scores[i]
                    * _sample_qc_ht.scores[i]))
        return lm_pred_expr

    residuals_ht = _sample_qc_ht.select(
        **{
            f"{metric}_residual": _sample_qc_ht[metric] - get_lm_prediction_expr(metric)
            for metric in _sample_qc_ht.lms
        })

    return residuals_ht.persist()
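
# A minimal usage sketch, assuming a sample QC HT with PCA `scores` and
# hl.sample_qc annotations already computed (path and field names illustrative):
qc_ht = hl.read_table('gs://some-bucket/sample_qc.ht')  # illustrative path
residuals_ht = compute_qc_metrics_residuals(
    ht=qc_ht,
    pc_scores=qc_ht.scores,
    qc_metrics={'n_snp': qc_ht.sample_qc.n_snp, 'r_ti_tv': qc_ht.sample_qc.r_ti_tv},
    n_pcs=10,
)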
    ║ Preprocessing ║
    ╚═══════════════╝
    """
    if phen_set == 'phesant':
        phen_tb_all1 = phen_tb_all.rename({'"' + phen + '"': 'phen'})
    else:
        phen_tb_all1 = phen_tb_all.rename({phen: 'phen'})
    phen_tb = phen_tb_all1.select(phen_tb_all1['phen'])

    mt1 = variants.annotate_cols(
        phen_str=hl.str(phen_tb[variants.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)
    if phen_tb.phen.dtype == hl.dtype('bool'):
        mt1 = mt1.annotate_cols(phen=hl.bool(mt1.phen_str)).drop('phen_str')
    else:
        mt1 = mt1.annotate_cols(phen=hl.float64(mt1.phen_str)).drop('phen_str')

    for sex in ['female', 'male']:
        mt2 = mt1.filter_cols(mt1.isFemale == (sex == 'female'))
        mt3 = mt2.add_col_index()
        mt3 = mt3.rename({'dosage': 'x', 'phen': 'y'})
        n_samples = mt3.count_cols()
        print('\n>>> phen ' + sex + ' ' + phen + ': N samples = ' + str(n_samples) + ' <<<')
        group_size = int(n_samples / n_chunks) + 1  # the ideal number of samples in each group
def get_residuals(phen, linreg):
    start = dt.datetime.now()
    path_f = wd + f'ukb31063.{phsource}.{phen}.residuals.female.reg3.tsv.bgz'
    path_m = wd + f'ukb31063.{phsource}.{phen}.residuals.male.reg3.tsv.bgz'
    try:
        # check_output raises CalledProcessError if either output doesn't exist yet
        subprocess.check_output(['gsutil', 'ls', path_f])
        subprocess.check_output(['gsutil', 'ls', path_m])
        print(f'\n#############\n{phen} already completed!\n#############\n')
    except subprocess.CalledProcessError:
        print(f'\n############\nStarting phenotype {phen}\n############\n')
        # join phenotype and covariate table
        phen_tb = phen_tb_all.select(phen).join(cov, how='inner')
        phen_tb = phen_tb.annotate(phen_str=hl.str(phen_tb[phen]))
        phen_tb = phen_tb.filter(phen_tb.phen_str == '', keep=False)
        if phen_tb[phen].dtype == hl.dtype('bool'):
            phen_tb = phen_tb.annotate(phen=hl.bool(phen_tb.phen_str.replace('\"', '')))
        else:
            phen_tb = phen_tb.annotate(phen=hl.float64(phen_tb.phen_str.replace('\"', '')))
        # Pull the fitted betas for this phenotype out of the linreg dataframe
        phen_betas = linreg[linreg.phen == phen][[x for x in linreg.columns.values if 'beta' in x]]
        betas = dict(zip(list(phen_betas.columns.values), phen_betas.values.tolist()[0]))
        fields = [x.replace('beta_', '') for x in linreg.columns.values if 'beta_' in x]
        phen_tb = phen_tb.annotate(intercept=1)
        phen_tb = phen_tb.annotate(y_hat=0)
        phen_tb_f = phen_tb.filter(phen_tb.sex == 1)  # female
        phen_tb_m = phen_tb.filter(phen_tb.sex == 0)  # male
        # Predicted value is the sum of beta * covariate over all regression fields
        phen_tb_f = phen_tb_f.annotate(
            y_hat=phen_tb_f.y_hat + hl.sum([phen_tb_f[f] * betas['beta_' + f] for f in fields]))
        phen_tb_m = phen_tb_m.annotate(
            y_hat=phen_tb_m.y_hat + hl.sum([phen_tb_m[f] * betas['beta_' + f] for f in fields]))
        phen_tb_f = phen_tb_f.annotate(res_f=phen_tb_f.phen - phen_tb_f.y_hat)
        phen_tb_m = phen_tb_m.annotate(res_m=phen_tb_m.phen - phen_tb_m.y_hat)
        phen_tb_f.select(phen, 'y_hat', 'res_f').export(path_f)
        phen_tb_m.select(phen, 'y_hat', 'res_m').export(path_m)
        phen_tb_f = hl.import_table(path_f, impute=True)
        phen_tb_m = hl.import_table(path_m, impute=True)
        res_var_f = (phen_tb_f.aggregate(hl.agg.stats(phen_tb_f.res_f)).stdev ** 2
                     if phen_tb_f.count() > 0 else float('nan'))
        res_var_m = (phen_tb_m.aggregate(hl.agg.stats(phen_tb_m.res_m)).stdev ** 2
                     if phen_tb_m.count() > 0 else float('nan'))
        linreg.loc[linreg.phen == phen, 'resid_var_f'] = res_var_f
        linreg.loc[linreg.phen == phen, 'resid_var_m'] = res_var_m
        print(f'Variance of residual for {phen} females:\t{linreg[linreg.phen==phen].resid_var_f.values[0]}')
        print(f'Variance of residual for {phen} males:\t{linreg[linreg.phen==phen].resid_var_m.values[0]}')
        print(f'\n############\nIteration time for {phen}: {round((dt.datetime.now()-start).seconds/60, 2)} minutes\n############')
    return linreg
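
# As a design note, Hail's filesystem helper can replace the gsutil subprocess
# check above; a minimal sketch, assuming the same output paths:
if hl.hadoop_exists(path_f) and hl.hadoop_exists(path_m):
    print(f'{phen} already completed!')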
def get_phen_mt(variant_set, phen, batch, n_chunks, constant_sex_ratio, write):
    print('Starting Part 2: Splitting into n groups')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    mt0 = hl.read_matrix_table('gs://nbaya/split/ukb31063.' + variant_set +
                               '_variants.gwas_samples_repart.mt')
    if 'sim' in phen:
        print('\nReading simulated phenotype...')
        if variant_set == 'qc_pos':
            mt1 = hl.read_matrix_table(
                'gs://nbaya/rg_sex/qc_pos.50_sim_inf_h2_0.485223.mt')  # outdated
        elif variant_set == 'hm3':
            # mt1 = hl.read_matrix_table('gs://nbaya/rg_sex/50_sim_inf_h2_0.485223.mt')
            sim_phen = phen.split('_sim')[0]
            if sim_phen == '50':
                phen_tb = hl.read_table('gs://nbaya/ldscsim/' + variant_set + '.phen_' +
                                        sim_phen + '.sim_h2_' + str(0.485223) + '.ht')
    else:
        print('\nReading UKB phenotype...')
        # mt0 = hl.read_matrix_table('gs://nbaya/split/ukb31063.hm3_variants.gwas_samples_v2.mt')  # old version
        if phen == '50_raw':
            phen_tb0 = hl.import_table('gs://nbaya/ukb31063.50_raw.tsv.bgz',
                                       missing='',
                                       impute=True,
                                       types={'s': hl.tstr}).rename({phen: 'phen'})
        elif phen == '50_raw_res':
            phen_tb0 = hl.read_table('gs://nbaya/split/50_raw_linreg.ht').rename({'res': 'phen'})
        else:
            phen_tb0 = hl.import_table(
                'gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
                missing='',
                impute=True,
                types={'"userId"': hl.tstr}).rename({'"userId"': 's', '"' + phen + '"': 'phen'})
        phen_tb0 = phen_tb0.key_by('s')
        phen_tb = phen_tb0.select(phen_tb0['phen'])

    mt1 = mt0.annotate_cols(phen_str=hl.str(phen_tb[mt0.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)
    if phen_tb.phen.dtype == hl.dtype('bool'):
        mt1 = mt1.annotate_cols(phen=hl.bool(mt1.phen_str)).drop('phen_str')
    else:
        mt1 = mt1.annotate_cols(phen=hl.float64(mt1.phen_str)).drop('phen_str')

    # Remove withdrawn samples
    withdrawn = hl.import_table('gs://nbaya/w31063_20181016.csv', missing='', no_header=True)
    withdrawn_set = set(withdrawn.f0.take(withdrawn.count()))
    mt1 = mt1.filter_cols(hl.literal(withdrawn_set).contains(mt1['s']), keep=False)
    mt1 = mt1.key_cols_by('s')

    if constant_sex_ratio:
        mt_ls = [mt1.filter_cols(mt1.isFemale == 0), mt1.filter_cols(mt1.isFemale == 1)]
        mt_final = []
        for mt_temp in mt_ls:
            n_samples = mt_temp.count_cols()
            # expect N samples to match n_non_missing from phenotypes.both_sexes.tsv,
            # minus withdrawn samples
            print('\n>>> N samples = ' + str(n_samples) + ' <<<')
            mt_temp2 = mt_temp.add_col_index()
            group_size = int(n_samples / n_chunks) + 1  # the ideal number of samples in each group
            # List of group ids to be paired to each sample.
            # (Note: len(group_ids) > number of cols in mt, but this doesn't affect the result.)
            group_ids = np.ndarray.tolist(
                np.ndarray.flatten(np.asarray([range(n_chunks)] * group_size)))
            group_ids = group_ids[0:n_samples]
            randstate = np.random.RandomState(int(batch))  # seed with batch number
            randstate.shuffle(group_ids)
            mt_final.append(mt_temp2.annotate_cols(
                group_id=hl.literal(group_ids)[hl.int32(mt_temp2.col_idx)]))  # assign group ids
        mt3 = mt_final[0].union_cols(mt_final[1])
    else:
        n_samples = mt1.count_cols()
        # expect N samples to match n_non_missing from phenotypes.both_sexes.tsv,
        # minus withdrawn samples
        print('\n>>> N samples = ' + str(n_samples) + ' <<<')
        mt2 = mt1.add_col_index()
        group_size = int(n_samples / n_chunks) + 1  # the ideal number of samples in each group
        # List of group ids to be paired to each sample.
        # (Note: len(group_ids) > number of cols in mt, but this doesn't affect the result.)
        group_ids = np.ndarray.tolist(
            np.ndarray.flatten(np.asarray([range(n_chunks)] * group_size)))
        group_ids = group_ids[0:n_samples]
        randstate = np.random.RandomState(int(batch))  # seed with batch number
        randstate.shuffle(group_ids)
        mt3 = mt2.annotate_cols(
            group_id=hl.literal(group_ids)[hl.int32(mt2.col_idx)])  # assign group ids

    ht_group_ids = mt3.select_cols(mt3.group_id).cols()  # table of group ids keyed by sample

    # Sanity check: mean of isFemale within each group
    print(mt3.aggregate_cols(hl.agg.group_by(mt3.group_id, hl.agg.mean(mt3.isFemale))))

    if write:
        print('Writing HailTable with group ids...')
        # OLD VERSION (takes ~30 min with 50 workers):
        # mt3.write('gs://nbaya/split/ukb31063.'+variant_set+'_variants.gwas_samples_'+phen+'_grouped'+str(n_chunks)+'_batch_'+batch+'.mt', overwrite=True)
        ht_group_ids.write(
            f'gs://nbaya/split/ukb31063.{variant_set}_variants.gwas_samples_{phen}_grouped{n_chunks}_constantsexratio_{constant_sex_ratio}_batch_{batch}.ht',
            overwrite=True)
    print('Finished Part 2: Splitting into n groups')
    # takes ~1h with 20 workers, 42 min with 30 workers
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    return mt3
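
# A quick sanity check on the split (hypothetical arguments): confirm per-group
# sizes are near n_samples / n_chunks and the sex ratio is roughly constant.
mt3 = get_phen_mt(variant_set='hm3', phen='50', batch='1', n_chunks=300,
                  constant_sex_ratio=True, write=False)
per_group = mt3.aggregate_cols(
    hl.agg.group_by(mt3.group_id,
                    hl.struct(n=hl.agg.count(), frac_female=hl.agg.mean(mt3.isFemale))))
print(per_group)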
# Chunks all phenotypes for phsource into parsplit chunks, then runs on the paridx-th chunk
idx = range(start_idx, stop_idx, 1)
for i in idx:
    phen = phenlist[i]
    print('\n############')
    print(f'Running phenotype {phen} (phen idx {i+1})')
    print(f'iter {idx.index(i)+1} of {len(idx)} for parallel batch {paridx} of {parsplit}')
    print('############')
    starttime = datetime.datetime.now()
    phen_tb = phen_tb_all.select(phen).join(cov_tb, how='inner')  # join phenotype and covariate table
    phen_tb = phen_tb.annotate(phen_str=hl.str(phen_tb[phen]))
    phen_tb = phen_tb.filter(phen_tb.phen_str == '', keep=False)
    if phen_tb[phen].dtype == hl.dtype('bool'):
        phen_tb = phen_tb.annotate(phen=hl.bool(phen_tb.phen_str.replace('\"', '')))
    else:
        phen_tb = phen_tb.annotate(phen=hl.float64(phen_tb.phen_str.replace('\"', '')))
    n = phen_tb.count()
    print(f'\n>>> Sample count for phenotype {phen}: {n} <<<')
    for cov_i, cov in enumerate(covs):
        cov = cov.copy()
        if cov_i + 1 in models_to_run:  # only run models in models_to_run
            # Don't run the regression if sex is a covariate AND the trait is sex-specific
            # (the female count being 0 or n makes count % n == 0)
            if 'sex' not in cov or phen_tb.filter(phen_tb.isFemale == 1).count() % n != 0:
                print(f'\n############\nRunning linreg model {cov_i+1} for phen {phen}\n############\n')
                if 'intercept' in cov:
                    cov.remove('intercept')
                # Rename sex terms (and cross terms with sex) to isFemale; leave the sibling fields as-is
                cov_list = [phen_tb[(x.replace('sex', 'isFemale') if 'sibs' not in x else x)] for x in cov]
                reg = phen_tb.aggregate(hl.agg.linreg(y=phen_tb.phen, x=[1] + cov_list))
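                # Hypothetical continuation: `reg` is a Python struct of fit
                # statistics (fields include beta, standard_error, t_stat,
                # p_value, multiple_r_squared, n). Labeling betas by covariate:
                betas = dict(zip(['intercept'] + cov, reg.beta))
                print(f'model {cov_i+1} betas: {betas}, R^2 = {reg.multiple_r_squared}')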
def annotate_transcript_consequences(variants_path, transcripts_path, mane_transcripts_path=None):
    ds = hl.read_table(variants_path)

    most_severe_consequence = ds.vep.most_severe_consequence
    transcript_consequences = ds.vep.transcript_consequences

    # Drop irrelevant consequences
    transcript_consequences = transcript_consequences.map(
        lambda c: c.annotate(
            consequence_terms=c.consequence_terms.filter(
                lambda t: ~OMIT_CONSEQUENCE_TERMS.contains(t)))
    ).filter(lambda c: c.consequence_terms.size() > 0)

    # Add/transmute derived fields
    transcript_consequences = transcript_consequences.map(
        lambda c: c.annotate(
            major_consequence=hl.sorted(c.consequence_terms, key=consequence_term_rank)[0])
    ).map(
        lambda c: c.annotate(
            domains=c.domains.map(lambda domain: domain.db + ":" + domain.name),
            hgvsc=c.hgvsc.split(":")[-1],
            hgvsp=hgvsp_from_consequence_amino_acids(c),
            is_canonical=hl.bool(c.canonical),
        ))

    transcript_consequences = transcript_consequences.map(
        lambda c: c.select(
            "biotype",
            "consequence_terms",
            "domains",
            "gene_id",
            "gene_symbol",
            "hgvsc",
            "hgvsp",
            "is_canonical",
            "lof_filter",
            "lof_flags",
            "lof",
            "major_consequence",
            "polyphen_prediction",
            "sift_prediction",
            "transcript_id",
        ))

    transcripts = hl.read_table(transcripts_path)

    transcript_info = hl.dict([
        (row.transcript_id, row.transcript_info)
        for row in transcripts.select(
            transcript_info=hl.struct(
                transcript_version=transcripts.transcript_version,
                gene_version=transcripts.gene.gene_version,
            )).collect()
    ])

    transcript_consequences = transcript_consequences.map(
        lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id)))

    if mane_transcripts_path:
        mane_transcripts = hl.read_table(mane_transcripts_path)
        mane_transcripts = hl.dict(
            [(row.gene_id, row.drop("gene_id")) for row in mane_transcripts.collect()])

        transcript_consequences = transcript_consequences.map(
            lambda csq: csq.annotate(**hl.rbind(
                mane_transcripts.get(csq.gene_id),
                lambda mane_transcript: (
                    hl.case()
                    .when(
                        (mane_transcript.ensembl_id == csq.transcript_id)
                        & (mane_transcript.ensembl_version == csq.transcript_version),
                        hl.struct(
                            is_mane_select=True,
                            is_mane_select_version=True,
                            refseq_id=mane_transcript.refseq_id,
                            refseq_version=mane_transcript.refseq_version,
                        ),
                    )
                    .when(
                        mane_transcript.ensembl_id == csq.transcript_id,
                        hl.struct(
                            is_mane_select=True,
                            is_mane_select_version=False,
                            refseq_id=hl.null(hl.tstr),
                            refseq_version=hl.null(hl.tstr),
                        ),
                    )
                    .default(
                        hl.struct(
                            is_mane_select=False,
                            is_mane_select_version=False,
                            refseq_id=hl.null(hl.tstr),
                            refseq_version=hl.null(hl.tstr),
                        ))),
            )))

        transcript_consequences = hl.sorted(
            transcript_consequences,
            lambda c: (
                hl.if_else(c.biotype == "protein_coding", 0, 1, missing_false=True),
                hl.if_else(c.major_consequence == most_severe_consequence, 0, 1, missing_false=True),
                hl.if_else(c.is_mane_select, 0, 1, missing_false=True),
                hl.if_else(c.is_canonical, 0, 1, missing_false=True),
            ),
        )
    else:
        transcript_consequences = hl.sorted(
            transcript_consequences,
            lambda c: (
                hl.if_else(c.biotype == "protein_coding", 0, 1, missing_false=True),
                hl.if_else(c.major_consequence == most_severe_consequence, 0, 1, missing_false=True),
                hl.if_else(c.is_canonical, 0, 1, missing_false=True),
            ),
        )

    ds = ds.annotate(transcript_consequences=transcript_consequences).drop("vep")

    return ds
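
# A usage sketch with illustrative paths (the MANE table is optional):
ds = annotate_transcript_consequences(
    'gs://some-bucket/variants.ht',      # illustrative; must contain a `vep` field
    'gs://some-bucket/transcripts.ht',   # per-transcript versions used for annotation
    mane_transcripts_path='gs://some-bucket/mane.ht',  # optional MANE Select metadata
)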
        VarDP=hl.float64(mt.info.VarDP),
        AS_ReadPosRankSum=hl.float64(mt.info.AS_ReadPosRankSum),
        AS_pab_max=hl.float64(mt.info.AS_pab_max),
        AS_QD=hl.float64(mt.info.AS_QD),
        AS_MQ=hl.float64(mt.info.AS_MQ),
        QD=hl.float64(mt.info.QD),
        AS_MQRankSum=hl.float64(mt.info.AS_MQRankSum),
        FS=hl.float64(mt.info.FS),
        AS_FS=hl.float64(mt.info.AS_FS),
        ReadPosRankSum=hl.float64(mt.info.ReadPosRankSum),
        AS_QUALapprox=hl.float64(mt.info.AS_QUALapprox),
        AS_SB_TABLE=mt.info.AS_SB_TABLE.map(lambda x: hl.float64(x)),
        AS_VarDP=hl.float64(mt.info.AS_VarDP),
        AS_SOR=hl.float64(mt.info.AS_SOR),
        SOR=hl.float64(mt.info.SOR),
        singleton=hl.bool(mt.info.singleton),
        transmitted_singleton=hl.bool(mt.info.transmitted_singleton),
        omni=hl.bool(mt.info.omni),
        mills=hl.bool(mt.info.mills),
        monoallelic=hl.bool(mt.info.monoallelic),
        AS_VQSLOD=hl.float64(mt.info.AS_VQSLOD),
        InbreedingCoeff=hl.float64(mt.info.InbreedingCoeff)))

# Writing out a VCF version of the dataset for downstream analyses
mt_vcf = mt_vcf.drop('gvcf_info')
hl.export_vcf(mt_vcf,
              'gs://african-seq-data/hgdp_tgp/hgdp_tgp_postqc.vcf.bgz',
              parallel='separate_header')

# Subsetting the variants in the dataset to only PASS variants (those which passed variant QC)
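# The excerpt ends before the PASS subset is shown; a minimal sketch of that
# filter, assuming variant QC failures are recorded in the row `filters` set
# (per VCF convention, PASS variants have an empty set):
mt_pass = mt.filter_rows(hl.len(mt.filters) == 0, keep=True)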