Ejemplo n.º 1
0
def get_group_to_counts_expr(k: hl.expr.StructExpression, counts: hl.expr.DictExpression) -> hl.expr.ArrayExpression:
    """
    Collect the entries of `counts` whose key is one of the candidate keys derived from `k`.

    Candidate keys are structs with fields `snv`, `all` and `csq`, enumerated as:

    - `snv`: every integer from 1 down to `k.snv` (inclusive), converted with `hl.bool`
    - `all`: every integer from 0 up to `k.all` (inclusive), converted with `hl.bool`
    - `csq`: every integer from 0 up to `k.csq` (inclusive), kept as is

    :param k: Struct providing the `snv`, `all` and `csq` bounds of the enumeration
    :param counts: Dictionary looked up with the candidate key structs
    :return: Array with `counts[key]` for each candidate key present in `counts`
    """

    def _keys_over_csq(snv_value, all_value):
        # Innermost level: one candidate key per csq value in [0, k.csq].
        return hl.range(0, k.csq + 1).map(
            lambda csq_value: hl.struct(
                snv=hl.bool(snv_value), all=hl.bool(all_value), csq=csq_value
            )
        )

    def _keys_over_all(snv_value):
        # Middle level: expand every `all` value in [0, k.all].
        return hl.range(0, k.all + 1).flatmap(
            lambda all_value: _keys_over_csq(snv_value, all_value)
        )

    # Outer level counts downwards from 1 to k.snv.
    candidate_keys = hl.range(1, k.snv - 1, step=-1).flatmap(_keys_over_all)
    known_keys = candidate_keys.filter(lambda key: counts.contains(key))
    return known_keys.map(lambda key: counts[key])
Ejemplo n.º 2
0
def annotate_phen(tb, phen, sex, phen_tb_dict, filter_to_phen=True):
    r'''
    Annotates `tb` with phenotype `phen` and, optionally, filters to
    individuals with the phenotype defined. Uses sex-specific IRNT phenotypes.

    tb: Hail Table (annotated/filtered by row) or MatrixTable (annotated/
        filtered by column). Must expose the sample id as `tb.s`.
    phen: name of the phenotype field to pull from the phenotype table.
    sex: key into `phen_tb_dict`. Options: female, male, both_sexes
    phen_tb_dict: dict mapping `sex` to a phenotype table that contains
        field `phen` and can be indexed by `tb.s`.
    filter_to_phen: if True, drop entries whose stringified phenotype is
        the empty string. If False, nothing is filtered.

    Returns `tb` with a new field `phen` (bool when the source field is
    bool, float64 otherwise).

    Raises TypeError if `tb` is neither a Table nor a MatrixTable.

    NOTE: the progress message reads `phen_dict` from module scope; it is
    not a parameter of this function.
    '''
    print(
        f'\n... Reading UKB phenotype "{phen_dict[phen][0]}" for {sex} (code: {phen}) ...'
    )

    phen_tb = phen_tb_dict[sex].select(phen).rename({phen: 'phen'})

    # Pick the row-wise (Table) or column-wise (MatrixTable) flavour of
    # annotate/filter so the rest of the function is container-agnostic.
    if isinstance(tb, hl.Table):
        annotate_fn = hl.Table.annotate
        filter_fn = hl.Table.filter
    elif isinstance(tb, hl.MatrixTable):
        annotate_fn = hl.MatrixTable.annotate_cols
        filter_fn = hl.MatrixTable.filter_cols
    else:
        raise TypeError(
            f'tb must be a hail Table or MatrixTable, got {type(tb).__name__}')

    # Go through a string representation (with double quotes stripped) so
    # that the "empty" check and the final cast work on the same value.
    annotated = annotate_fn(
        self=tb,
        phen_str=hl.str(phen_tb[tb.s]['phen']).replace('\"', ''))

    if filter_to_phen:  # filter to individuals with phenotype data defined
        annotated = filter_fn(self=annotated,
                              expr=annotated.phen_str == '',
                              keep=False)
    # When filter_to_phen is False the unfiltered data is cast as is; this
    # path previously failed because the filtered table was never bound.

    cast = hl.bool if phen_tb.phen.dtype == hl.dtype('bool') else hl.float64
    return annotate_fn(self=annotated,
                       phen=cast(annotated.phen_str)).drop('phen_str')
Ejemplo n.º 3
0
def compute_qc_metrics_residuals(
    ht: hl.Table,
    pc_scores: hl.expr.ArrayNumericExpression,
    qc_metrics: Dict[str, hl.expr.NumericExpression],
    use_pc_square: bool = True,
    n_pcs: Optional[int] = None,
    regression_sample_inclusion_expr: hl.expr.BooleanExpression = hl.bool(
        True),
) -> hl.Table:
    """
    Regress each QC metric on the PCs (and optionally PC^2) and return the residuals.

    .. note::

        `regression_sample_inclusion_expr` only restricts which samples are used to fit the regressions.
        Residuals are computed for every sample in `ht`.

    :param ht: Input sample QC metrics HT
    :param pc_scores: The expression in the input HT that stores the PC scores
    :param qc_metrics: A dictionary with the name of each QC metric to compute residuals for and their corresponding expression in the input HT.
    :param use_pc_square: Whether to use PC^2 in the regression or not
    :param n_pcs: Number of PCs to use. If not set, the smallest `pc_scores` length found in `ht` is used.
    :param regression_sample_inclusion_expr: An optional expression to select samples to include in the regression calculation.
    :return: Table with one `<metric>_residual` field per QC metric
    """
    # Keep only what the computation needs: the metrics, the PC scores and the inclusion flag.
    work_ht = ht.select(
        **qc_metrics,
        scores=pc_scores,
        _keep=regression_sample_inclusion_expr,
    )

    # Fall back to the shortest score array when no PC count was given.
    if n_pcs is None:
        n_pcs = work_ht.aggregate(hl.agg.min(hl.len(work_ht.scores)))

    logger.info(
        "Computing regressed QC metrics filters using %d PCs for metrics: %s",
        n_pcs,
        ", ".join(qc_metrics),
    )

    # Design row layout: [intercept, PC_1..PC_n] followed, if requested, by [PC_1^2..PC_n^2].
    pc_terms = [work_ht.scores[pc_idx] for pc_idx in range(n_pcs)]
    covariates = [1.0, *pc_terms]
    if use_pc_square:
        covariates += [pc * pc for pc in pc_terms]

    # One linear regression per metric, fitted on the included samples only.
    regressions = {}
    for metric in qc_metrics:
        regressions[metric] = hl.agg.filter(
            work_ht._keep,
            hl.agg.linreg(y=work_ht[metric], x=covariates),
        )
    fitted_models = work_ht.aggregate(hl.struct(**regressions))

    work_ht = work_ht.annotate_globals(lms=fitted_models).persist()

    def _fitted_value(metric: str):
        """Build the regression prediction for `metric` from the stored betas."""
        beta = work_ht.lms[metric].beta
        linear_part = hl.sum(
            hl.range(n_pcs).map(lambda i: beta[i + 1] * work_ht.scores[i]))
        fitted = beta[0] + linear_part
        if use_pc_square:
            # Squared-term betas follow the intercept and the n_pcs linear betas.
            quadratic_part = hl.sum(
                hl.range(n_pcs).map(lambda i: beta[i + n_pcs + 1] * work_ht.
                                    scores[i] * work_ht.scores[i]))
            fitted = fitted + quadratic_part
        return fitted

    residual_exprs = {
        f"{metric}_residual": work_ht[metric] - _fitted_value(metric)
        for metric in work_ht.lms
    }

    return work_ht.select(**residual_exprs).persist()
Ejemplo n.º 4
0
    ║ Preprocessing ║
    ╚═══════════════╝
    """

    if phen_set == 'phesant':
        phen_tb_all1 = phen_tb_all.rename({'"' + phen + '"': 'phen'})
    else:
        phen_tb_all1 = phen_tb_all.rename({phen: 'phen'})
    phen_tb = phen_tb_all1.select(phen_tb_all1['phen'])

    mt1 = variants.annotate_cols(
        phen_str=hl.str(phen_tb[variants.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)

    if phen_tb.phen.dtype == hl.dtype('bool'):
        mt1 = mt1.annotate_cols(phen=hl.bool(mt1.phen_str)).drop('phen_str')
    else:
        mt1 = mt1.annotate_cols(phen=hl.float64(mt1.phen_str)).drop('phen_str')

    for sex in ['female', 'male']:
        mt2 = mt1.filter_cols(mt1.isFemale == (sex == 'female'))

        mt3 = mt2.add_col_index()
        mt3 = mt3.rename({'dosage': 'x', 'phen': 'y'})

        n_samples = mt3.count_cols()
        print('\n>>> phen ' + sex + ' ' + phen + ': N samples = ' +
              str(n_samples) + ' <<<')

        group_size = int(
            n_samples /
Ejemplo n.º 5
0
def get_residuals(phen, linreg):
    """
    Compute per-sex residual variances of `phen` under a fitted linear model.

    If the female and male residual files for `phen` are not both present
    (checked with `gsutil ls`), the residuals are computed and exported:
    the phenotype is inner-joined to the covariate table, the prediction
    `y_hat` is built from the `beta_*` columns of `linreg`, and
    `phen - y_hat` is written per sex. The exported files are then read
    back and the variance of the residuals is stored in `linreg`.

    Reads `wd`, `phsource`, `phen_tb_all` and `cov` from module scope.

    :param phen: phenotype name; a field of `phen_tb_all` and a value of
        `linreg.phen`.
    :param linreg: pandas DataFrame with a `phen` column and one `beta_<x>`
        column per regression term. Modified in place.
        NOTE(review): assumes every `beta_*` value for `phen` is a usable
        coefficient and that `<x>` is a field of the joined table (or
        `intercept`) -- confirm against the producer of `linreg`.
    :return: `linreg`, with `resid_var_f` / `resid_var_m` set for `phen`
        (NaN when the corresponding residual table is empty).
    """
    start = dt.datetime.now()
    path_f = wd + f'ukb31063.{phsource}.{phen}.residuals.female.reg3.tsv.bgz'
    path_m = wd + f'ukb31063.{phsource}.{phen}.residuals.male.reg3.tsv.bgz'

    # Probe for existing output. Only a failed/unavailable `gsutil ls` means
    # "not done yet"; errors in the computation itself must not be mistaken
    # for that, so the probe is kept separate from the work below.
    try:
        for path in (path_f, path_m):
            subprocess.check_output(['gsutil', 'ls', path])
        already_exported = True
    except (subprocess.CalledProcessError, OSError):
        already_exported = False

    if already_exported:
        print(f'\n#############\n{phen} already completed!\n#############\n')
    else:
        print(f'\n############\nStarting phenotype {phen}\n############\n')
        # Join phenotype and covariate table.
        phen_tb = phen_tb_all.select(phen).join(cov, how='inner')
        phen_tb = phen_tb.annotate(phen_str=hl.str(phen_tb[phen]))
        phen_tb = phen_tb.filter(phen_tb.phen_str == '', keep=False)
        cast = (hl.bool
                if phen_tb[phen].dtype == hl.dtype('bool') else hl.float64)
        phen_tb = phen_tb.annotate(
            phen=cast(phen_tb.phen_str.replace('\"', '')))

        phen_betas = linreg[linreg.phen == phen][[
            x for x in linreg.columns.values if 'beta' in x
        ]]
        betas = dict(
            zip(list(phen_betas.columns.values),
                phen_betas.values.tolist()[0]))
        fields = [
            x.replace('beta_', '') for x in linreg.columns.values
            if 'beta_' in x
        ]
        # `intercept` is a constant 1 so it can be treated like any other
        # regression term; `y_hat` starts at 0 and accumulates the terms.
        phen_tb = phen_tb.annotate(intercept=1)
        phen_tb = phen_tb.annotate(y_hat=0)

        def _with_prediction(tb):
            # Sum over ALL terms. Building a dict with the single key
            # 'y_hat' per term would silently keep only the last one.
            linear_predictor = sum(tb[f] * betas['beta_' + f] for f in fields)
            return tb.annotate(y_hat=tb.y_hat + linear_predictor)

        phen_tb_f = _with_prediction(phen_tb.filter(phen_tb.sex == 1))  # female
        phen_tb_m = _with_prediction(phen_tb.filter(phen_tb.sex == 0))  # male
        phen_tb_f = phen_tb_f.annotate(res_f=phen_tb_f.phen - phen_tb_f.y_hat)
        phen_tb_m = phen_tb_m.annotate(res_m=phen_tb_m.phen - phen_tb_m.y_hat)
        phen_tb_f.select(phen, 'y_hat', 'res_f').export(path_f)
        phen_tb_m.select(phen, 'y_hat', 'res_m').export(path_m)

    # Always read the residuals back from the exported files, whether they
    # were just written or already existed.
    phen_tb_f = hl.import_table(path_f, impute=True)
    phen_tb_m = hl.import_table(path_m, impute=True)
    res_var_f = phen_tb_f.aggregate(hl.agg.stats(
        phen_tb_f.res_f)).stdev**2 if phen_tb_f.count() > 0 else float('nan')
    res_var_m = phen_tb_m.aggregate(hl.agg.stats(
        phen_tb_m.res_m)).stdev**2 if phen_tb_m.count() > 0 else float('nan')
    linreg.loc[linreg.phen == phen, 'resid_var_f'] = res_var_f
    linreg.loc[linreg.phen == phen, 'resid_var_m'] = res_var_m
    print(
        f'Variance of residual for {phen} females:\t{linreg[linreg.phen==phen].resid_var_f.values[0]}'
    )
    print(
        f'Variance of residual for {phen} males:\t{linreg[linreg.phen==phen].resid_var_m.values[0]}'
    )
    # total_seconds() so that runs longer than a day are not wrapped around.
    elapsed_min = round((dt.datetime.now() - start).total_seconds() / 60, 2)
    print(
        f'\n############\nIteration time for {phen}: {elapsed_min} minutes\n############'
    )
    return linreg
Ejemplo n.º 6
0
def _shuffled_group_ids(n_samples, n_chunks, batch):
    """Return `n_samples` group ids cycling through range(n_chunks), shuffled with seed int(batch)."""
    # Same sequence as repeating range(n_chunks) and truncating to n_samples.
    group_ids = [i % n_chunks for i in range(n_samples)]
    np.random.RandomState(int(batch)).shuffle(group_ids)
    return group_ids


def _annotate_group_ids(mt, n_chunks, batch):
    """Add a column index to `mt` and assign each column a shuffled `group_id`."""
    n_samples = mt.count_cols()
    # Expect n samples to match n_non_missing from phenotypes.both_sexes.tsv,
    # minus withdrawn samples.
    print('\n>>> N samples = ' + str(n_samples) + ' <<<')

    mt_indexed = mt.add_col_index()
    group_ids = _shuffled_group_ids(n_samples, n_chunks, batch)
    # Explicit dtype so an empty list (no samples) still has a known type.
    group_ids_expr = hl.literal(group_ids, hl.tarray(hl.tint32))
    return mt_indexed.annotate_cols(
        group_id=group_ids_expr[hl.int32(mt_indexed.col_idx)])


def get_phen_mt(variant_set, phen, batch, n_chunks, constant_sex_ratio, write):
    """
    Attach a phenotype to the genotype MatrixTable and split samples into random groups.

    Steps: read the `variant_set` MatrixTable, look up the phenotype table
    for `phen`, annotate columns with `phen` (dropping samples whose
    stringified phenotype is empty), remove withdrawn samples, and assign
    each remaining sample a `group_id` in range(n_chunks).

    :param variant_set: variant set name used to build the input/output
        paths (e.g. 'hm3', 'qc_pos').
    :param phen: phenotype code. Names containing 'sim' select a simulated
        phenotype; '50_raw' and '50_raw_res' have dedicated sources; any
        other value is read from the PHESANT both-sexes phenotype file.
    :param batch: batch label; `int(batch)` seeds the shuffle and the raw
        value is used in the output path.
    :param n_chunks: number of groups to split the samples into.
    :param constant_sex_ratio: if True, group ids are assigned separately
        within `isFemale == 0` and `isFemale == 1` samples (with the same
        seed) and the two halves are recombined with `union_cols`.
    :param write: if True, write the table of group ids to GCS.
    :return: MatrixTable with `phen`, `col_idx` and `group_id` column fields.
    :raises ValueError: for simulated phenotypes other than variant set
        'hm3' with base phenotype '50' (no phenotype table is available for
        them; this combination previously failed on an unbound variable).
    """
    print('Starting Part 2: Splitting into n groups')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))

    mt0 = hl.read_matrix_table('gs://nbaya/split/ukb31063.' + variant_set +
                               '_variants.gwas_samples_repart.mt')

    if 'sim' in phen:
        print('\nReading simulated phenotype...')
        sim_phen = phen.split('_sim')[0]
        if variant_set != 'hm3' or sim_phen != '50':
            raise ValueError(
                f'No simulated phenotype table for variant_set={variant_set!r}, '
                f'phen={phen!r}; only hm3 with phenotype 50 is supported')
        phen_tb = hl.read_table(
            f'gs://nbaya/ldscsim/{variant_set}.phen_{sim_phen}.sim_h2_0.485223.ht'
        )
    else:
        print('\nReading UKB phenotype...')

        if phen == '50_raw':
            phen_tb0 = hl.import_table('gs://nbaya/ukb31063.50_raw.tsv.bgz',
                                       missing='',
                                       impute=True,
                                       types={
                                           's': hl.tstr
                                       }).rename({phen: 'phen'})
            # Imported tables are unkeyed; key by sample id so the table can
            # be indexed with mt0.s below.
            phen_tb0 = phen_tb0.key_by('s')
        elif phen == '50_raw_res':
            # NOTE(review): assumes this table is already keyed by sample id.
            phen_tb0 = hl.read_table(
                'gs://nbaya/split/50_raw_linreg.ht').rename({'res': 'phen'})
        else:
            phen_tb0 = hl.import_table(
                'gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
                missing='',
                impute=True,
                types={
                    '"userId"': hl.tstr
                }).rename({
                    '"userId"': 's',
                    '"' + phen + '"': 'phen'
                })
            phen_tb0 = phen_tb0.key_by('s')

        phen_tb = phen_tb0.select(phen_tb0['phen'])

    # Go through a quote-stripped string so "empty" can be detected before
    # casting to the final type.
    mt1 = mt0.annotate_cols(
        phen_str=hl.str(phen_tb[mt0.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)

    cast = hl.bool if phen_tb.phen.dtype == hl.dtype('bool') else hl.float64
    mt1 = mt1.annotate_cols(phen=cast(mt1.phen_str)).drop('phen_str')

    # Remove withdrawn samples
    withdrawn = hl.import_table('gs://nbaya/w31063_20181016.csv',
                                missing='',
                                no_header=True)
    withdrawn_set = set(withdrawn.f0.collect())
    mt1 = mt1.filter_cols(hl.literal(withdrawn_set).contains(mt1['s']),
                          keep=False)
    mt1 = mt1.key_cols_by('s')

    if constant_sex_ratio:
        # Assign groups within each sex separately, then recombine.
        mt_by_sex = [
            _annotate_group_ids(mt1.filter_cols(mt1.isFemale == is_female),
                                n_chunks, batch) for is_female in (0, 1)
        ]
        mt3 = mt_by_sex[0].union_cols(mt_by_sex[1])
    else:
        mt3 = _annotate_group_ids(mt1, n_chunks, batch)

    ht_group_ids = mt3.select_cols(mt3.group_id).cols()

    # Report the fraction of females per group.
    print(
        mt3.aggregate_cols(
            hl.agg.group_by(mt3.group_id, hl.agg.mean(mt3.isFemale))))

    if write:
        print('Writing HailTable with group ids...')
        ht_group_ids.write(
            f'gs://nbaya/split/ukb31063.{variant_set}_variants.gwas_samples_{phen}_grouped{n_chunks}_constantsexratio_{constant_sex_ratio}_batch_{batch}.ht',
            overwrite=True)

    print('Finished Part 2: Splitting into n groups')
    # Takes ~1h with 20 workers, 42 min with 30 workers.
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))

    return mt3
Ejemplo n.º 7
0
# Indices into `phenlist` handled by this run.
idx = range(start_idx,stop_idx,1) #chunks all phenotypes for phsource into parsplit number of chunks, then runs on the paridx-th chunk

# For each phenotype: build the phenotype+covariate table, then fit the
# requested linear-regression models on it.
for i in idx:
    phen = phenlist[i]
    
    print('\n############')
    print(f'Running phenotype {phen} (phen idx {i+1})')
    print(f'iter {idx.index(i)+1} of {len(idx)} for parallel batch {paridx} of {parsplit}')
    print('############')
    starttime = datetime.datetime.now()
    
    # Stringify the phenotype so rows with an empty value can be dropped,
    # then cast the quote-stripped string back to bool or float64 depending
    # on the dtype of the source field.
    phen_tb = phen_tb_all.select(phen).join(cov_tb, how='inner') #join phenotype and covariate table
    phen_tb = phen_tb.annotate(phen_str = hl.str(phen_tb[phen]))
    phen_tb = phen_tb.filter(phen_tb.phen_str == '',keep=False)
    if phen_tb[phen].dtype == hl.dtype('bool'):
        phen_tb = phen_tb.annotate(phen = hl.bool(phen_tb.phen_str.replace('\"','')))
    else:
        phen_tb = phen_tb.annotate(phen = hl.float64(phen_tb.phen_str.replace('\"','')))
            
    n = phen_tb.count()
    print(f'\n>>> Sample count for phenotype {phen}: {n} <<<')
    
    # Model numbers are 1-based: model k uses covs[k-1].
    for cov_i, cov in enumerate(covs):
        # Work on a copy so removing 'intercept' below does not alter `covs`.
        cov = cov.copy()
        if cov_i+1 in models_to_run: #only run models in models_to_run
            # (female count) % n is 0 exactly when the female count is 0 or n,
            # i.e. every remaining sample has the same isFemale value.
            # NOTE(review): when n == 0 and 'sex' is in cov, the modulo raises
            # ZeroDivisionError -- confirm whether empty phenotypes can occur.
            if 'sex' not in cov or phen_tb.filter(phen_tb.isFemale == 1).count() % n != 0: #don't run regression if sex in cov AND trait is sex specific
                print(f'\n############\nRunning linreg model {cov_i+1} for phen {phen}\n############\n')
                # The intercept is passed explicitly as the leading 1 in `x`
                # below, so it is dropped from the covariate names.
                if 'intercept' in cov:
                    cov.remove('intercept')
                cov_list = [phen_tb[(x.replace('sex','isFemale') if 'sibs' not in x else x)] for x in cov] # change all terms with sex or cross terms with sex to isFemale, but ignore the sibling fields
                reg = phen_tb.aggregate(hl.agg.linreg(y=phen_tb.phen, x = [1]+cov_list))
Ejemplo n.º 8
0
def annotate_transcript_consequences(variants_path,
                                     transcripts_path,
                                     mane_transcripts_path=None):
    """
    Replace the `vep` field of a variants table with a cleaned, sorted
    `transcript_consequences` array.

    For every VEP transcript consequence this:

    * removes consequence terms listed in `OMIT_CONSEQUENCE_TERMS` and drops
      consequences left without any term,
    * derives `major_consequence`, `domains`, `hgvsc`, `hgvsp` and
      `is_canonical`,
    * keeps a fixed set of fields,
    * adds `transcript_version` and `gene_version` from the transcripts table,
    * if `mane_transcripts_path` is given, adds `is_mane_select`,
      `is_mane_select_version`, `refseq_id` and `refseq_version`.

    Consequences are then sorted so that protein-coding transcripts come
    first, then those whose major consequence equals the variant's most
    severe consequence, then (only with MANE data) MANE Select transcripts,
    then canonical transcripts.

    :param variants_path: path of the Hail table with a `vep` field.
    :param transcripts_path: path of the Hail table of transcripts.
    :param mane_transcripts_path: optional path of the MANE transcripts table.
    :return: the variants table with `transcript_consequences` and no `vep`.
    """
    ds = hl.read_table(variants_path)
    top_consequence = ds.vep.most_severe_consequence

    # Drop irrelevant consequence terms, then consequences with no term left.
    def _without_omitted_terms(csq):
        kept_terms = csq.consequence_terms.filter(
            lambda term: ~OMIT_CONSEQUENCE_TERMS.contains(term))
        return csq.annotate(consequence_terms=kept_terms)

    csqs = ds.vep.transcript_consequences.map(_without_omitted_terms)
    csqs = csqs.filter(lambda csq: csq.consequence_terms.size() > 0)

    # Derived fields. `major_consequence` is added in its own pass so that it
    # already exists on the struct handed to the second pass.
    csqs = csqs.map(lambda csq: csq.annotate(major_consequence=hl.sorted(
        csq.consequence_terms, key=consequence_term_rank)[0]))
    csqs = csqs.map(lambda csq: csq.annotate(
        domains=csq.domains.map(lambda domain: domain.db + ":" + domain.name),
        hgvsc=csq.hgvsc.split(":")[-1],
        hgvsp=hgvsp_from_consequence_amino_acids(csq),
        is_canonical=hl.bool(csq.canonical),
    ))

    kept_fields = (
        "biotype",
        "consequence_terms",
        "domains",
        "gene_id",
        "gene_symbol",
        "hgvsc",
        "hgvsp",
        "is_canonical",
        "lof_filter",
        "lof_flags",
        "lof",
        "major_consequence",
        "polyphen_prediction",
        "sift_prediction",
        "transcript_id",
    )
    csqs = csqs.map(lambda csq: csq.select(*kept_fields))

    # Version information, looked up by transcript id.
    transcripts = hl.read_table(transcripts_path)
    version_rows = transcripts.select(transcript_info=hl.struct(
        transcript_version=transcripts.transcript_version,
        gene_version=transcripts.gene.gene_version,
    )).collect()
    transcript_info = hl.dict([(row.transcript_id, row.transcript_info)
                               for row in version_rows])
    csqs = csqs.map(
        lambda csq: csq.annotate(**transcript_info.get(csq.transcript_id)))

    if mane_transcripts_path:
        mane_rows = hl.read_table(mane_transcripts_path).collect()
        mane_by_gene = hl.dict([(row.gene_id, row.drop("gene_id"))
                                for row in mane_rows])

        def _mane_fields(csq, mane):
            # Three outcomes: same transcript and version, same transcript
            # only, or no match.
            same_transcript = mane.ensembl_id == csq.transcript_id
            same_version = mane.ensembl_version == csq.transcript_version
            exact_match = hl.struct(
                is_mane_select=True,
                is_mane_select_version=True,
                refseq_id=mane.refseq_id,
                refseq_version=mane.refseq_version,
            )
            id_only_match = hl.struct(
                is_mane_select=True,
                is_mane_select_version=False,
                refseq_id=hl.null(hl.tstr),
                refseq_version=hl.null(hl.tstr),
            )
            no_match = hl.struct(
                is_mane_select=False,
                is_mane_select_version=False,
                refseq_id=hl.null(hl.tstr),
                refseq_version=hl.null(hl.tstr),
            )
            return (hl.case().when(same_transcript & same_version,
                                   exact_match).when(
                                       same_transcript,
                                       id_only_match).default(no_match))

        csqs = csqs.map(lambda csq: csq.annotate(**hl.rbind(
            mane_by_gene.get(csq.gene_id),
            lambda mane: _mane_fields(csq, mane),
        )))

    def _sort_key(csq):
        # Each component is 0 for "preferred" and 1 otherwise (missing counts
        # as not preferred).
        components = [
            hl.if_else(
                csq.biotype == "protein_coding", 0, 1, missing_false=True),
            hl.if_else(csq.major_consequence == top_consequence,
                       0,
                       1,
                       missing_false=True),
        ]
        if mane_transcripts_path:
            components.append(
                hl.if_else(csq.is_mane_select, 0, 1, missing_false=True))
        components.append(
            hl.if_else(csq.is_canonical, 0, 1, missing_false=True))
        return tuple(components)

    csqs = hl.sorted(csqs, _sort_key)

    return ds.annotate(transcript_consequences=csqs).drop("vep")
Ejemplo n.º 9
0
    VarDP=hl.float64(mt.info.VarDP),
    AS_ReadPosRankSum=hl.float64(mt.info.AS_ReadPosRankSum),
    AS_pab_max=hl.float64(mt.info.AS_pab_max),
    AS_QD=hl.float64(mt.info.AS_QD),
    AS_MQ=hl.float64(mt.info.AS_MQ),
    QD=hl.float64(mt.info.QD),
    AS_MQRankSum=hl.float64(mt.info.AS_MQRankSum),
    FS=hl.float64(mt.info.FS),
    AS_FS=hl.float64(mt.info.AS_FS),
    ReadPosRankSum=hl.float64(mt.info.ReadPosRankSum),
    AS_QUALapprox=hl.float64(mt.info.AS_QUALapprox),
    AS_SB_TABLE=mt.info.AS_SB_TABLE.map(lambda x: hl.float64(x)),
    AS_VarDP=hl.float64(mt.info.AS_VarDP),
    AS_SOR=hl.float64(mt.info.AS_SOR),
    SOR=hl.float64(mt.info.SOR),
    singleton=hl.bool(mt.info.singleton),
    transmitted_singleton=hl.bool(mt.info.transmitted_singleton),
    omni=hl.bool(mt.info.omni),
    mills=hl.bool(mt.info.omni),
    monoallelic=hl.bool(mt.info.monoallelic),
    AS_VQSLOD=hl.float64(mt.info.AS_VQSLOD),
    InbreedingCoeff=hl.float64(mt.info.InbreedingCoeff)))

# Write out a VCF version of the dataset for downstream analyses.
# The `gvcf_info` field is removed from the dataset before exporting.
mt_vcf = mt_vcf.drop('gvcf_info')

# NOTE(review): per the Hail docs, parallel='separate_header' should produce
# per-partition files plus a separate header file rather than a single
# VCF -- confirm downstream tools expect that layout.
hl.export_vcf(mt_vcf,
              'gs://african-seq-data/hgdp_tgp/hgdp_tgp_postqc.vcf.bgz',
              parallel='separate_header')

# Subsetting the variants in the dataset to only PASS variants (those which passed variant QC)