Beispiel #1
0
def get_veploader_and_regions(filter_vids):

    '''
    Build the Ensembl-VEP annotation loader and the matching gene-region table.

    The VEP table at snakemake.input.vep_tsv is read and reduced to the rows
    selected by intersect_ids(filter_vids, <variant ids in the table>).
    For effect 'LOF' the loader gets ids, locations and genes only. For
    effect 'missense' only rows with impact >= snakemake.params.min_impact
    are kept, and the impact column is handed to the loader as data.

    The regions are read from snakemake.input.regions_bed (first four
    columns) and limited to genes that also occur in eveploader.pos_df.gene.
    The gene id is the part of the region name before the first '_'.

    :param filter_vids: variant ids to intersect with the VEP table
    :return: (eveploader, regions), where regions has the columns
        chrom, start, end, name, gene and is sorted by chrom, start, end
    :raises NotImplementedError: if snakemake.params.effect is neither
        'LOF' nor 'missense'
    '''

    effect = snakemake.params.effect

    if effect not in ('LOF', 'missense'):
        raise NotImplementedError('effect has to be either missense or LOF!')

    # the missense branch needs two extra columns on top of the shared ones
    vep_columns = ['Uploaded_variation', 'Location', 'Gene']
    if effect == 'missense':
        vep_columns += ['pos_standardized', 'impact']

    vep = pd.read_csv(snakemake.input.vep_tsv, sep='\t', usecols=vep_columns, index_col='Uploaded_variation')

    # restrict to the requested variants, then turn the id back into a column
    shared_vids = intersect_ids(filter_vids, vep.index.values)
    vep = vep.loc[shared_vids].reset_index()

    if effect == 'LOF':
        eveploader = EnsemblVEPLoader(vep['Uploaded_variation'], vep['Location'], vep['Gene'])
    else:
        # Variants are filtered individually by their own impact here, not by
        # the maximum impact per (Gene, pos_standardized) group, because the
        # variants can't be weighted in this test.
        vep = vep[vep.impact >= snakemake.params.min_impact]
        eveploader = EnsemblVEPLoader(vep['Uploaded_variation'], vep['Location'], vep['Gene'], data=vep[['impact']].values)

    bed = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0,1,2,3], dtype={0:str, 1: np.int32, 2:np.int32, 3:str})
    bed.columns = ['chrom', 'start', 'end', 'name']

    # discard all genes for which we don't have annotations
    bed['gene'] = bed.name.str.split('_', expand=True)[0]
    bed = bed.set_index('gene')

    annotated_genes = intersect_ids(np.unique(bed.index.values), np.unique(eveploader.pos_df.gene))
    bed = bed.loc[annotated_genes].reset_index()

    column_order = ['chrom','start','end','name','gene']
    regions = bed.sort_values(['chrom','start','end'])[column_order]

    return eveploader, regions
def sid_filter(vids):
    '''
    Optionally restrict variant ids to those listed in a user-supplied file.

    Without a 'sid_include' entry in snakemake.config, vids is returned
    unchanged. Otherwise the file named by that entry is read line by line
    (through gzip when the path ends with 'gz'), trailing whitespace is
    stripped from every line, and intersect_ids(vids, <ids from the file>)
    is returned.

    :param vids: variant ids to filter
    :return: vids itself, or the result of intersect_ids(vids, ids from file)
    '''

    if 'sid_include' not in snakemake.config:
        return vids

    print('limiting to variants present in {}'.format(snakemake.config['sid_include']))

    infilepath = snakemake.config['sid_include']

    # pick the reader once; both variants yield text lines
    if infilepath.endswith('gz'):
        handle = gzip.open(infilepath, 'rt')
    else:
        handle = open(infilepath, 'r')

    with handle as infile:
        sid = np.array([line.rstrip() for line in infile])

    return intersect_ids(vids, sid)
        if chromosome not in ['chr9', 'chr16', 'chr21']:
            continue
    
    # set up the ensembl vep loader for the chromosome
    ensemblvepdf = pd.read_csv(vep_tsv,
                               sep='\t',
                               usecols=['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact'],
                               index_col='Uploaded_variation')
    
    # get set of variants for the chromosome:
    mac_report = maf_filter(mac_report)
    filter_vids = mac_report.index.values
    filter_vids = sid_filter(filter_vids)

    # filter by MAF
    keep = intersect_ids(filter_vids, ensemblvepdf.index.values)
    ensemblvepdf = ensemblvepdf.loc[keep]
    ensemblvepdf.reset_index(inplace=True)

    # filter by impact:
    ensemblvepdf = ensemblvepdf[ensemblvepdf.groupby(['Gene','pos_standardized'])['impact'].transform(np.max) >= snakemake.params.min_impact ]

    # initialize the loader
    eveploader = EnsemblVEPLoader(ensemblvepdf['Uploaded_variation'], ensemblvepdf['Location'], ensemblvepdf['Gene'], data = ensemblvepdf[['pos_standardized','impact']].values)

    # set up the regions to loop over for the chromosome
    regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0,1,2,3], dtype={0:str, 1: np.int32, 2:np.int32, 3:str})
    regions.columns = ['chrom', 'start', 'end', 'name']

    # discard all genes for which we don't have annotations
    regions['gene'] = regions.name.str.split('_', expand=True)[0]
Beispiel #4
0
    spliceaidf = pd.read_csv(vep_tsv,
                             sep='\t',
                             usecols=[
                                 'name', 'chrom', 'end', 'gene', 'max_effect',
                                 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG',
                                 'DP_AL', 'DP_DG', 'DP_DL'
                             ],
                             index_col='name')

    # get set of variants for the chromosome:
    mac_report = maf_filter(mac_report)
    filter_vids = mac_report.index.values
    filter_vids = sid_filter(filter_vids)

    # filter by MAF
    keep = intersect_ids(filter_vids, spliceaidf.index.values)
    spliceaidf = spliceaidf.loc[keep]
    spliceaidf.reset_index(inplace=True)

    # filter by impact:
    spliceaidf = spliceaidf[
        spliceaidf.max_effect >= snakemake.params.min_impact]

    # set up the regions to loop over for the chromosome
    regions = regions_all.copy()

    # discard all genes for which we don't have annotations
    gene_ids = regions.name.str.split(
        '_', expand=True)  # table with two columns, ensembl-id and gene-name
    regions['gene'] = gene_ids[1]  # this is the gene name
    regions['ensembl_id'] = gene_ids[0]
Beispiel #5
0
def main():

    '''
    Export one binary burden indicator per gene region to the HDF5 output.

    Steps:
      1. collect the variant ids from maf_filter() and sid_filter()
      2. build the VEP loader and the region table for those variants
      3. restrict the genotype loader and the VEP loader to their common
         variants, and the genotype loader to the ids from iid_filter()
      4. for every region with annotated variants, compute the indicator and
         write it, in batches of `batch_size` regions, as rows of dataset
         'G' in snakemake.output.h5; the region names go to
         snakemake.output.gene_txt in the same order
      5. shrink 'G' to the number of rows actually written

    Side effects: fills and resizes 'G' in snakemake.output.h5, creates
    snakemake.output.gene_txt (opened with mode 'x', so it must not exist
    yet), and creates a relative symlink at snakemake.output.iid_txt that
    points to snakemake.input.complete_cases.
    '''

    # get variants that pass MAF and genotyping filters
    filter_vids = maf_filter()
    filter_vids = sid_filter(filter_vids)

    # get the variant effect predictions and gene regions:
    eveploader, regions = get_veploader_and_regions(filter_vids)

    # get the genotype loader:
    plinkloader = VariantLoaderSnpReader(Bed(snakemake.input.genotypes_bed, count_A1=True))

    # intersect variants
    common_vids = intersect_ids(plinkloader.get_vids(), eveploader.get_vids())

    plinkloader.update_variants(common_vids)
    eveploader.update_variants(common_vids)

    # drop irrelevant individuals
    iids = iid_filter()
    plinkloader.update_individuals(iids)

    # number of regions collected before they are written to the HDF5 file
    batch_size = 100

    def load_and_proc_geno(interval):
        '''
        Return (burden indicator, variant ids) for one region.

        Raises GotNone when the region's gene is unknown to the VEP loader
        (KeyError) or when no annotated variant falls into the region.
        '''

        try:
            V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0])
        except KeyError:
            raise GotNone

        if V1.index.empty:
            raise GotNone

        vids = V1.index.get_level_values('vid')

        temp_genotypes, temp_vids = plinkloader.genotypes_by_id(vids, return_pos=False)

        # Non-finite genotype entries are replaced by 0. No further
        # pre-processing is done because the "weird" variants have already
        # been removed upstream.
        temp_genotypes = np.ma.masked_invalid(temp_genotypes).filled(0.)

        # 1 for every row with at least one genotype value above 0.5, else 0
        # (rows are presumably individuals - TODO confirm)
        G1_burden = (np.sum(temp_genotypes > 0.5, axis=1, keepdims=True) > 0.).astype('i1')

        return G1_burden, vids

    def write_batch(out, idfile, columns, names, start):
        '''Write one batch of regions at row `start`; return the next free row.'''
        out[start:(start + len(names))] = np.concatenate(columns, axis = 1).T
        write_ids(names, idfile)
        return start + len(names)

    # initialize output file
    initialize_h5(snakemake.output.h5, len(regions), len(iids))

    # Both handles are managed by `with` so that they are closed even if
    # processing a region raises something other than GotNone.
    with h5py.File(snakemake.output.h5, 'r+') as h5, open(snakemake.output.gene_txt, 'x') as idfile:

        out = h5['G']

        i = 0  # keeps track of how many entries have been exported to the hdf5 file
        b = []
        ids = []

        # the actual loop that glues it all together
        for _, region in regions.iterrows():

            try:
                G, vids = load_and_proc_geno(region)
            except GotNone:
                # nothing to export for this region
                continue

            b.append(G)
            ids.append(region['name'])

            if len(b) == batch_size:
                i = write_batch(out, idfile, b, ids, i)
                b = []
                ids = []

        # write the last, partially filled batch
        if len(b) > 0:
            i = write_batch(out, idfile, b, ids, i)

        # the dataset was sized for all regions; drop the unused rows
        out.resize(i, axis=0)

    relpath_out = os.path.relpath(snakemake.input.complete_cases, os.path.dirname(snakemake.output.iid_txt) )

    os.symlink(relpath_out, snakemake.output.iid_txt)
    collapser = LocalCollapsing(distance_threshold=51.)

    for strand in ['plus', 'minus']:

        # set up the regions to loop over for the chromosome
        chromosome_id = chromosome.replace('chr', '')

        regions = regions_all[(regions_all.chrom == chromosome_id)
                              & (regions_all.strand == strand)]

        # get variants that pass variant effect prediction threshold:
        vep_vids, vep_mask = vep_filter(vep_h5[chromosome][strand],
                                        vep_bed[chromosome][strand])

        # combine
        filter_vids_chromosome = intersect_ids(vep_vids, filter_vids)

        # initialize the vep loader
        veploader = Hdf5Loader(vep_bed[chromosome][strand],
                               vep_h5[chromosome][strand],
                               'diffscore',
                               from_janggu=True)
        veploader.update_variants(filter_vids_chromosome)
        veploader.set_mask(vep_mask)

        # set up the variant loader (rbp variants) for the chromosome + strand
        plinkloader = VariantLoaderSnpReader(
            Bed(bed, count_A1=True, num_threads=4))
        plinkloader.update_variants(veploader.get_vids())
        plinkloader.update_individuals(covariatesloader.get_iids())
Beispiel #7
0
# Module-level setup for the conditional analysis, followed by the head of
# the per-chromosome loop.

# storing all results here
results = []

# counters, both starting at zero (where they are incremented is not visible
# in this part of the script)
i_gene = 0
i_chrom = 0

# conditional analysis:
# these genotypes will be used for the conditional analysis
geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
# restrict the conditional genotypes to the ids reported by covariatesloader
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
# keep rows whose pheno equals the 'phenotype' param or the 'pheno' wildcard,
# then drop duplicated rows
conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates()

# limit geno_cond to the ids returned by intersect_ids for the listed
# snp_rsid values and the loader's own variant ids
geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))

# enter the chromosome loop:
timer = Timer()
# every geno_vep entry is unpacked into eight per-chromosome values
for i, (chromosome, bed, vep_tsv, ensembl_vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep):

    # skip chromosomes (name without the 'chr' prefix) that do not occur in
    # regions_all.chrom
    if chromosome.replace('chr','') not in regions_all.chrom.unique():
        continue

    if snakemake.params.debug:
        # process only two chromosomes if debugging...
        # NOTE(review): the break only fires once i_chrom exceeds 2; if
        # i_chrom counts processed chromosomes from 0 this allows three, not
        # two - confirm the intended limit
        if i_chrom > 2:
            break

    timer.reset()
Beispiel #8
0
def test_full_rank_continuous():
    """Regression test for the score test without background kernel (noK).

    Loads the dummy VEPs, genotypes, regions and covariates shipped in the
    seak 'data/' package directory, runs ``ScoretestNoK`` for every region
    using the ``diffscore_max`` kernel, and asserts that the p-values are
    numerically close to the stored reference results.
    """
    import time

    import numpy as np
    import pandas as pd
    import pkg_resources

    from seak import data_loaders
    from seak import kernels
    from seak import scoretest

    data_path = pkg_resources.resource_filename('seak', 'data/')

    # Path to veps
    path_to_VEP_bed = data_path + "dummy_veps.bed"
    path_to_VEP_hdf5 = data_path + "dummy_veps.hdf5"

    # Path to genotypes
    path_to_covariates = data_path + "dummy_covariates_fixed.csv"
    path_to_plink_files_with_prefix = data_path + "full_rank_continuous"

    # Path to regions
    path_to_reference_genes_bed = data_path + "dummy_regions.bed"

    # Load data
    # VEPs
    hdf5_loader = data_loaders.Hdf5Loader(path_to_vep_bed=path_to_VEP_bed, path_to_vep_hdf5=path_to_VEP_hdf5,
                                          hdf5_key='diffscore')

    # Genotypes
    plink_loader = data_loaders.VariantLoaderSnpReader(path_to_plink_files_with_prefix+'.bed')

    # Genes
    ucsc_region_loader = data_loaders.BEDRegionLoader(path_to_regions_UCSC_BED=path_to_reference_genes_bed,
                                                      chrom_to_load=1, drop_non_numeric_chromosomes=True)

    # Covariates
    covariate_loader_csv = data_loaders.CovariatesLoaderCSV(phenotype_of_interest='pheno_full_rank_continuous',
                                                            path_to_covariates=path_to_covariates,
                                                            covariate_column_names=['cov1', 'cov2'])

    # Overlap individuals: genotypes and covariates
    print('Overlaps')
    print('Individuals')
    genotypes_covariates_intersection = data_loaders.intersect_ids(plink_loader.get_iids(), covariate_loader_csv.get_iids())

    print(genotypes_covariates_intersection.shape)
    print(genotypes_covariates_intersection)

    # Overlap genotypes with VEPs
    print('Genotypes')
    print(len(hdf5_loader.veps_index_df.index))
    veps_genotypes_intersection = data_loaders.intersect_ids(hdf5_loader.get_vids(), plink_loader.get_vids())
    print(len(veps_genotypes_intersection))

    # Update respective instances
    print('Updates')
    plink_loader.update_variants(veps_genotypes_intersection)
    print('hdf5_loader.veps_index_df.shape', hdf5_loader.veps_index_df.shape)

    hdf5_loader.update_variants(veps_genotypes_intersection)
    print('hdf5_loader.veps_index_df.shape', hdf5_loader.veps_index_df.shape)

    plink_loader.update_individuals(genotypes_covariates_intersection)
    print('covariate_loader_csv.cov.shape', covariate_loader_csv.cov.shape)
    covariate_loader_csv.update_individuals(genotypes_covariates_intersection)
    print('covariate_loader_csv.cov.shape', covariate_loader_csv.cov.shape)

    Y, X = covariate_loader_csv.get_one_hot_covariates_and_phenotype(test_type='noK')
    null_model = scoretest.ScoretestNoK(Y, X)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    result_columns = ['name', 'chrom', 'start', 'end', 'p_value', 'n_SNVs', 'time']
    rows = []

    for _, region in ucsc_region_loader.regions.iterrows():
        t_test_gene_start = time.time()
        temp_genotypes_info_dict = region.to_dict()
        temp_genotypes, temp_vids = plink_loader.genotypes_by_region(region)
        if temp_genotypes is None:
            # no genotypes returned for this region
            continue

        G, temp_vids = data_loaders.VariantLoader.preprocess_genotypes(temp_genotypes, temp_vids, impute_mean=True,
                                                                       normalize=False, invert_encoding=True,
                                                                       recode_maf=False)
        if G is None:
            # pre-processing returned no genotypes for this region
            continue

        V = hdf5_loader.anno_by_id(temp_vids)

        GV = kernels.diffscore_max(G, V, False)
        temp_p_value = null_model.pv_alt_model(GV)
        temp_genotypes_info_dict['p_value'] = temp_p_value
        temp_genotypes_info_dict['n_SNVs'] = G.shape[1]
        t_test_gene_end = time.time()
        temp_time = float(t_test_gene_end - t_test_gene_start)
        temp_genotypes_info_dict['time'] = temp_time
        rows.append(temp_genotypes_info_dict)

    # Expected columns first, then any additional keys the regions carry;
    # reindex also yields the expected columns when no region produced a row.
    results = pd.DataFrame(rows)
    extra_columns = [c for c in results.columns if c not in result_columns]
    results = results.reindex(columns=result_columns + extra_columns)

    print(results)
    reference_result = pd.read_csv(data_path + 'reference_results/test_full_rank_continuous.csv', index_col=0)
    print(np.corrcoef(reference_result['p_value'], results['p_value']))
    print(np.all((np.isclose(reference_result['p_value'], results['p_value']))))
    assert np.all((np.isclose(reference_result['p_value'], results['p_value']))), 'The last change in code changes the result!!'
    Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
# restrict the conditional genotypes to the ids reported by covariatesloader
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list,
                               sep='\t',
                               header=0)
# keep rows whose pheno equals the 'phenotype' param or the 'pheno' wildcard,
# then drop duplicated rows
conditional_list = conditional_list[
    (conditional_list.pheno == snakemake.params['phenotype']) |
    (conditional_list.pheno == snakemake.wildcards['pheno'])].drop_duplicates(
    )
# only keep entries whose gene_name also occurs in results.gene_name
conditional_list = conditional_list.loc[conditional_list.gene_name.isin(
    results.gene_name)]

# limit geno_cond to the ids returned by intersect_ids for the listed
# snp_rsid values and the loader's own variant ids
geno_cond.update_variants(
    intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info(
    'considering {} variants as covariates for conditional tests.'.format(
        len(geno_cond.get_vids())))

# set up the null model
# Y and X come from the covariates loader, requested with the 'NoK' test type
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
null_model = ScoretestNoK(Y, X)

logging.info('Phenotype: {}, Sample size: {}'.format(
    snakemake.params.phenotype, len(Y)))


def test_gene(gene):

    # conditional analysis
# Module-level setup for the conditional analysis, followed by the head of
# the per-chromosome loop.

# storing all results here
results = []

# counters, both starting at zero (where they are incremented is not visible
# in this part of the script)
i_gene = 0
i_chrom = 0

# conditional analysis:
# these genotypes will be used for the conditional analysis
geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
# restrict the conditional genotypes to the ids reported by covariatesloader
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
# keep rows whose pheno equals the 'phenotype' param or the 'pheno' wildcard,
# then drop duplicated rows
conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates()

# limit geno_cond to the ids returned by intersect_ids for the listed
# snp_rsid values and the loader's own variant ids
geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))

# enter the chromosome loop:
timer = Timer()
# every geno_vep entry is unpacked into four per-chromosome values
for i, (chromosome, bed, mac_report, vep_tsv) in enumerate(geno_vep):

    # skip chromosomes (name without the 'chr' prefix) that do not occur in
    # regions_all.chrom
    if chromosome.replace('chr','') not in regions_all.chrom.unique():
        continue

    if snakemake.params.debug:
        # process only two chromosomes if debugging...
        # NOTE(review): the break only fires once i_chrom exceeds 2; if
        # i_chrom counts processed chromosomes from 0 this allows three, not
        # two - confirm the intended limit
        if i_chrom > 2:
            break

    timer.reset()
# Module-level setup for the conditional analysis, followed by the head of
# the per-chromosome loop.

# storing all results here
results = []

# counters, both starting at zero (where they are incremented is not visible
# in this part of the script)
i_gene = 0
i_chrom = 0

# conditional analysis:
# these genotypes will be used for the conditional analysis
geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1))
# restrict the conditional genotypes to the ids reported by covariatesloader
geno_cond.update_individuals(covariatesloader.get_iids())

# this file contains the mapping of associations to SNPs to condition on
conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0)
# keep rows whose pheno equals the 'phenotype' param or the 'pheno' wildcard,
# then drop duplicated rows
conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates()

# limit geno_cond to the ids returned by intersect_ids for the listed
# snp_rsid values and the loader's own variant ids
geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids()))
logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids())))


# enter the chromosome loop:
timer = Timer()
# every geno_vep entry is unpacked into seven per-chromosome values
for i, (chromosome, bed, vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep):

    # skip chromosomes (name without the 'chr' prefix) that do not occur in
    # regions_all.chrom
    if chromosome.replace('chr','') not in regions_all.chrom.unique():
        continue

    if snakemake.params.debug:
        # process only two chromosomes if debugging...
        # NOTE(review): the break only fires once i_chrom exceeds 2; if
        # i_chrom counts processed chromosomes from 0 this allows three, not
        # two - confirm the intended limit
        if i_chrom > 2:
            break
Beispiel #12
0
def test_full_rank_continuous():
    """Regression test for the score test with a full-rank background kernel (2K).

    Loads the dummy VEPs, genotypes, regions, covariates and background-kernel
    genotypes shipped in the seak 'data/' package directory, computes the
    background kernel, runs ``Scoretest2K`` for every region using the
    ``diffscore_max`` kernel, and asserts that the p-values are numerically
    close to the stored reference results.
    """
    # imports
    import pkg_resources
    import time

    import numpy as np
    import pandas as pd

    from seak import construct_background_kernel
    from seak import data_loaders
    from seak import kernels
    from seak import scoretest

    from seak.data_loaders import intersect_ids

    data_path = pkg_resources.resource_filename('seak', 'data/')

    # Path to veps
    path_to_VEP_bed = data_path + "dummy_veps.bed"
    path_to_VEP_hdf5 = data_path + "dummy_veps.hdf5"

    # Path to genotypes
    path_to_covariates = data_path + "dummy_covariates_fixed.csv"
    path_to_plink_files_with_prefix = data_path + "full_rank_continuous"

    # Path to regions
    path_to_reference_genes_bed = data_path + "dummy_regions.bed"

    # Path to background kernel
    path_to_plink_bg_kernel = data_path + "full_rank_background_kernel"

    # Load data
    # VEPs
    hdf5_loader = data_loaders.Hdf5Loader(path_to_vep_bed=path_to_VEP_bed,
                                          path_to_vep_hdf5=path_to_VEP_hdf5,
                                          hdf5_key='diffscore')

    # Genotypes
    plink_loader = data_loaders.VariantLoaderSnpReader(
        path_to_plink_files_with_prefix + '.bed')

    # Genes
    ucsc_region_loader = data_loaders.BEDRegionLoader(
        path_to_regions_UCSC_BED=path_to_reference_genes_bed,
        chrom_to_load=1,
        drop_non_numeric_chromosomes=True)

    # Covariates
    covariate_loader_csv = data_loaders.CovariatesLoaderCSV(
        phenotype_of_interest='pheno_full_rank_continuous',
        path_to_covariates=path_to_covariates,
        covariate_column_names=['cov1', 'cov2'])

    # Background_kernel
    background_kernel = construct_background_kernel.GRMLoaderSnpReader(
        path_to_plink_bg_kernel + '.bed', 10, '1')

    # Overlap individuals: genotypes and covariates
    genotypes_covariates_GRM_intersection = data_loaders.intersect_ids(
        plink_loader.get_iids(), covariate_loader_csv.get_iids())
    # Overlap genotypes with VEPs
    veps_genotypes_intersection = data_loaders.intersect_ids(
        hdf5_loader.get_vids(), plink_loader.get_vids())

    # Update respective instances
    plink_loader.update_variants(veps_genotypes_intersection)
    hdf5_loader.update_variants(veps_genotypes_intersection)
    plink_loader.update_individuals(genotypes_covariates_GRM_intersection)
    covariate_loader_csv.update_individuals(
        genotypes_covariates_GRM_intersection)
    background_kernel.update_individuals(genotypes_covariates_GRM_intersection)

    background_kernel.compute_background_kernel()
    print('nb_SNVs_unf: {}, nb_SNVs_f: {}'.format(
        background_kernel.nb_SNVs_unf, background_kernel.nb_SNVs_f))
    print('sum(diag(K)): {}'.format(np.diag(background_kernel.K0).sum()))

    Y, X = covariate_loader_csv.get_one_hot_covariates_and_phenotype(
        test_type='2K')
    null_model = scoretest.Scoretest2K(Y, X, background_kernel.K0,
                                       background_kernel.G0)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    result_columns = [
        'name', 'chrom', 'start', 'end', 'p_value', 'n_SNVs', 'time']
    rows = []

    for region in ucsc_region_loader:
        t_test_gene_start = time.time()
        # work on a copy so the object handed out by the loader is not
        # modified when the result fields are added below
        temp_genotypes_info_dict = dict(region)
        temp_genotypes, temp_vids = plink_loader.genotypes_by_region(
            region, return_pos=False)
        if temp_genotypes is None:
            # no genotypes returned for this region
            continue

        G, temp_vids = data_loaders.VariantLoader.preprocess_genotypes(
            temp_genotypes,
            temp_vids,
            impute_mean=True,
            normalize=False,
            invert_encoding=True,
            recode_maf=False)
        if G is None:
            # pre-processing returned no genotypes for this region
            continue

        V = hdf5_loader.anno_by_id(temp_vids)

        GV = kernels.diffscore_max(G, V, False)
        temp_p_value = null_model.pv_alt_model(GV)
        temp_genotypes_info_dict['p_value'] = temp_p_value
        temp_genotypes_info_dict['n_SNVs'] = G.shape[1]
        t_test_gene_end = time.time()
        temp_time = float(t_test_gene_end - t_test_gene_start)
        temp_genotypes_info_dict['time'] = temp_time
        rows.append(temp_genotypes_info_dict)

    # Expected columns first, then any additional keys the regions carry;
    # reindex also yields the expected columns when no region produced a row.
    results = pd.DataFrame(rows)
    extra_columns = [c for c in results.columns if c not in result_columns]
    results = results.reindex(columns=result_columns + extra_columns)

    # Print the path that is actually loaded; the previous message named a
    # different file than the one read.
    reference_path = (
        data_path +
        'reference_results/test_full_rank_continuous_2K_computed.csv')
    reference_result = pd.read_csv(reference_path, index_col=0)
    print(reference_path)

    print('expected result:')
    print(reference_result)
    print('actual result:')
    print(results)
    print('p-value corrcoef:')
    print(np.corrcoef(reference_result['p_value'], results['p_value']))

    assert np.all((np.isclose(
        reference_result['p_value'],
        results['p_value']))), 'The last change in code changes the result!!'