Example #1
from os.path import dirname, join, realpath

from numpy import dtype, nan
from numpy.testing import assert_array_equal, assert_equal
from pandas_plink import read_plink1_bin


def test_read_plink1_bin():

    datafiles = join(dirname(realpath(__file__)), "data_files")
    file_prefix = join(datafiles, "data")
    bim = file_prefix + ".bim"
    bed = file_prefix + ".bed"
    fam = file_prefix + ".fam"

    G = read_plink1_bin(bed, bim, fam, verbose=False)
    assert_equal(G.data.dtype, dtype("float64"))

    snp = G.where((G.chrom == "1") & (G.pos == 72515), drop=True)["snp"].values
    assert_array_equal(snp, ["rs4030300"])

    shape = G.where(G.chrom == "1", drop=True).shape
    assert_array_equal(shape, [3, 10])

    shape = G.where(G.chrom == "2", drop=True).shape
    assert_array_equal(shape, [3, 0])

    g = G.where((G.fid == "Sample_2") & (G.iid == "Sample_2"), drop=True)
    assert_array_equal(g["trait"].values, -9)

    arr = [
        [2.0, 2.0, nan, nan, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0],
        [2.0, 1.0, nan, nan, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0],
        [1.0, 2.0, nan, 1.0, 2.0, 2.0, 0.0, 2.0, 2.0, 2.0],
    ]
    assert_array_equal(G, arr)
Example #2
from pandas_plink import read_plink1_bin


def read_genotype(geno_prefix):
    try:
        G = read_plink1_bin(geno_prefix + '.bed',
                            geno_prefix + '.bim',
                            geno_prefix + '.fam',
                            ref='a0',
                            verbose=False)
    except Exception:
        return None
    return G
Example #3
from os.path import dirname, join, realpath

from numpy.testing import assert_equal
from pandas_plink import read_plink1_bin


def test_read_plink1_bin_wildcard():
    datafiles = join(dirname(realpath(__file__)), "data_files")
    bed_files = join(datafiles, "chr*.bed")

    G = read_plink1_bin(bed_files, verbose=False)
    G.where(G.chrom == "11", drop=True).values
    assert_equal(G.where(G.chrom == "11", drop=True).shape, (14, 779))
    assert_equal(G.where(G.chrom == "12", drop=True).shape, (14, 473))
    x = [[0.00, 0.00], [0.00, 1.00]]
    assert_equal(G.where(G.chrom == "11", drop=True).values[:2, :2], x)
Example #4
import numpy as np
from pandas_plink import read_plink1_bin


def ReadPlink(plink_file, bim, dtype=np.float32):
    # Read the .bed/.bim/.fam triple that shares the given prefix.
    Genotype = read_plink1_bin(plink_file + ".bed",
                               plink_file + ".bim",
                               plink_file + ".fam",
                               verbose=False)
    # Keep only the SNPs selected by the row indices in bim['index'].
    Genotype = Genotype.where(Genotype.snp.isin(
        Genotype.snp.values[bim['index'].values]),
                              drop=True)
    G_geno = Genotype.values
    # Handle missing genotypes while the array is still float: set them to 2,
    # so they become 0 after the flip below.
    G_geno[np.isnan(G_geno)] = 2
    # Flip the 0/1/2 allele-count coding.
    G_geno = 2 - G_geno
    return G_geno.astype(dtype)
Example #5
import argparse as ap
import os

import pandas_plink as pdp


def main(args):
    ## Parse arguments
    argp = ap.ArgumentParser(description="")
    argp.add_argument("--bfile", required=True, help="stem of the PLINK binary file name that contains data on both reference individuals and those to be dropped")
    args = argp.parse_args(args)

    bedfile = pdp.read_plink1_bin(str(args.bfile) + ".bed")
    nInds = bedfile.shape[0]
    outped = str(args.bfile) + "_pseudoHap.ped"
    for ind_idx in range(nInds):
        # make_pseudohap is a helper defined elsewhere in the original script.
        make_pseudohap(x=ind_idx, bedfile=bedfile, outped=outped)
    os.system("awk \'{print $1, $2, $3, $4}\' " + str(args.bfile)+ ".bim > " + str(args.bfile) + "_pseudoHap.map")
    args.bfile = str(args.bfile) + "_pseudoHap"
    os.system("plink1.9 --file " + str(args.bfile) + " --make-bed --out " + str(args.bfile))
    os.system("rm " + str(args.bfile) + ".ped")
    os.system("rm " + str(args.bfile)+ ".map")
    return 0
Example #6
    def load_raw_bed(self):
        from pandas_plink import read_plink1_bin
        print(
            'WARNING: THESE FUNCTIONS ARE VERY TIME CONSUMING. IT MIGHT TAKE UP TO 48 h'
        )
        # read parts of bed values https://github.com/limix/pandas-plink/blob/master/doc/usage.rst
        # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
        """Loading Files"""
        # os.chdir(path_input)
        G = read_plink1_bin(self.path_raw + "Peds_CIO_merged_qc_data.bed",
                            bim=None,
                            fam=None,
                            verbose=False)
        samples = G.sample.values  # samples
        variants = G.variant.values
        s, v = len(samples), len(variants)
        print('Original shape: ', s, v)  # Shape:  454 6726287
        cadd = self.raw_cadd()
        '''Saving samples output'''
        np.save(self.path + 'samples', samples)
        '''Making sure the Cadd and variants are in the same order (very important)'''
        cadd['variants_cat'] = pd.Categorical(cadd['variants'],
                                              categories=variants,
                                              ordered=True)
        cadd_sort = cadd.sort_values(by=['variants_cat'])
        cadd_sort.reset_index(inplace=True)
        if np.equal(cadd_sort.variants, variants).sum() == len(variants):
            print('CADD and variants are in the same order')
            del cadd
        else:
            print('ERROR: CADD and variants are in DIFFERENT order')

        cadd_sort.fillna(value={'CADD_PHRED': 0}, inplace=True)
        """First PRUNE: IF 0 IN ONE AND 1 ON ANOTHER: 2, IF 1 AND 0: 0, IF 0 AND 0: 1"""
        # Takes 48 hours to finish
        data_d, data_s, variants_, cadd_ = self.filteringSNPS(
            variants, cadd_sort.CADD_PHRED.values, samples, G, 'f1')
        """FINAL PRUNE: IF 0 IN ONE AND 1 ON ANOTHER: 2, IF 1 AND 0: 0, IF 0 AND 0: 1"""
        data_d, data_s, variants_, cadd_ = self.filteringSNPS(
            np.array(variants_), np.array(cadd_), samples, G, '', 0.2)
        fixing_erros()
        adding_known_snps_back(G, samples, variants_, cadd_, cadd_sort, data_d,
                               data_s)
Example #7
from os.path import dirname, join, realpath

import pytest
from pandas_plink import read_plink1_bin


def test_read_plink1_bin_wildcard_not_found():
    datafiles = join(dirname(realpath(__file__)), "data_files")
    bed_files = join(datafiles, "chrr*.bed")

    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, "chr11.bim", verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    bim_files = join(datafiles, "chrr*.bim")
    with pytest.raises(ValueError):
        read_plink1_bin(bed_files, bim_files, verbose=False)

    bed_files = join(datafiles, "chr*.bed")
    bim_files = join(datafiles, "chr*.bim")
    fam_files = join(datafiles, "chr*.fam")
    with pytest.warns(UserWarning):
        read_plink1_bin(bed_files, bim_files, fam_files, verbose=True)
Example #8
    def read_bed(path, type_pos):
        '''
        Read the .bed, .bim and .fam files.

        Parameters
        ----------
        path : string
            Path to the .bed, .bim and .fam files without the extension.
            All three files need to be in the same folder.
        type_pos : integer
            1 to use Morgans, 2 to use base-pair coordinates.

        Returns
        -------
        gen_A : pandas.DataFrame
            DataFrame with all the genotypes in the additive model.
        pos : pandas.Series
            Series with the position of each SNP, in Morgans (type_pos=1)
            or base pairs (type_pos=2).
        chrom : pandas.Series
            Series with the chromosome number of each SNP.
        '''

        data = read_plink1_bin(path + ".bed",
                               path + ".bim",
                               path + ".fam",
                               verbose=False)
        print("read OK")
        gen_A = pd.DataFrame(data=np.transpose(data.values),
                             index=data.variant.snp.values)
        print("gen_A ok")
        if type_pos == 1:
            pos = pd.Series(np.transpose(data.variant.cm.values),
                            index=data.variant.snp.values)
        elif type_pos == 2:
            pos = pd.Series(np.transpose(data.variant.pos.values),
                            index=data.variant.snp.values)
        else:
            raise ValueError("type_pos must be 1 (Morgans) or 2 (base pairs)")
        print("pos ok")
        chrom = pd.Series(np.transpose(data.variant.chrom.values),
                          index=data.variant.snp.values)
        print("chrom ok")
        return (gen_A, pos, chrom)
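
A minimal usage sketch for read_bed; the "mydata" prefix is hypothetical, and read_bed is assumed to be callable directly even though it appears as a nested function in its source:

gen_A, pos, chrom = read_bed("mydata", type_pos=2)  # base-pair coordinates
print(gen_A.shape)  # (n_variants, n_samples): the transpose of read_plink1_bin's layout
print(pos.head())   # SNP positions indexed by SNP identifier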
Example #9
    def read_genotypes(self, genotype_files, ld_block_files, standardize=True):
        """
        Read the genotype files
        :return:
        """

        if self.verbose:
            print("> Reading genotype files...")

        if not iterable(ld_block_files):
            ld_block_files = [ld_block_files]

        self.n_per_snp = {}
        self.genotypes = {}
        self.bed_files = {}

        for i, (bfile, ldb_file) in tqdm(enumerate(
                zip_longest(genotype_files, ld_block_files)),
                                         disable=not self.verbose):

            # Read plink file:
            try:
                gt_ac = read_plink1_bin(bfile + ".bed",
                                        ref="a0",
                                        verbose=False)
            except ValueError:
                gt_ac = read_plink1_bin(bfile, ref="a0", verbose=False)
            except Exception as e:
                self.genotypes = None
                self.sample_ids = None
                raise e

            gt_ac = gt_ac.set_index(variant='snp')

            # Filter individuals:
            if self.keep_individuals is not None:

                common_samples = pd.DataFrame({
                    'Sample': gt_ac.sample.values
                }).merge(
                    pd.DataFrame({'Sample': self.keep_individuals},
                                 dtype=type(
                                     gt_ac.sample.values[0])))['Sample'].values

                gt_ac = gt_ac.sel(sample=common_samples)

            # Filter SNPs:
            if self.keep_snps is not None:
                common_snps = pd.DataFrame({
                    'SNP': gt_ac.variant.values
                }).merge(pd.DataFrame({'SNP': self.keep_snps}))['SNP'].values
                gt_ac = gt_ac.sel(variant=common_snps)

            # Obtain information about current chromosome:
            chr_id, (chr_n, chr_p) = int(gt_ac.chrom.values[0]), gt_ac.shape

            # Assign the number of samples per SNP
            # This accounts for missing data
            self.n_per_snp[chr_id] = gt_ac.shape[0] - gt_ac.isnull().sum(
                axis=0).compute().values

            maf = gt_ac.sum(axis=0) / (2. * self.n_per_snp[chr_id])
            #maf = np.round(np.where(maf > .5, 1. - maf, maf), 6)
            gt_ac = gt_ac.assign_coords({"MAF": ("variant", maf)})

            # Standardize genotype matrix:
            if standardize:
                gt_ac = (gt_ac - gt_ac.mean(axis=0)) / gt_ac.std(axis=0)
                self.standardized_genotype = standardize
                gt_ac = gt_ac.fillna(0.)

            # Add filename to the bedfiles dictionary:
            self.bed_files[chr_id] = bfile

            if i == 0:
                self.sample_ids = gt_ac.sample.values

            # TODO: Harmonize the code given the updated keys (using chrom_id now).
            self.genotypes[chr_id] = {'CHR': chr_id, 'G': gt_ac}

            # If an LD block file is provided, then read it,
            # match snps with their corresponding blocks,
            # and create a list of snp coordinates in each block:
            if ldb_file is not None:

                # Read LD block file:
                ldb_df = pd.read_csv(ldb_file, delim_whitespace=True)

                # Create a SNP dataframe with BP position:
                snp_df = pd.DataFrame({'pos': gt_ac.pos.values})

                # Assign each SNP its appropriate block ID
                snp_df['block_id'] = snp_df['pos'].apply(
                    lambda pos: ldb_df.loc[(pos >= ldb_df['start']) &
                                           (pos < ldb_df['stop'])].index[0])

                ld_blocks = []

                for b_idx in range(len(ldb_df)):
                    ld_blocks.append(
                        da.array(snp_df.loc[snp_df['block_id'] ==
                                            b_idx].index.tolist()))

                self.genotypes[chr_id]['LD Blocks'] = ld_blocks
Example #10
from pandas_plink import read_plink1_bin


def read_plink(bed_file):
    snp_info = read_plink1_bin(bed_file + ".bed",
                               bed_file + ".bim",
                               bed_file + ".fam",
                               verbose=False)
    return snp_info.values
Example #11
from pathlib import Path
from typing import Optional, Union

import dask.array as da
from pandas_plink import read_plink, read_plink1_bin


def load_plink_array(path_to_plink_files: Optional[Union[str, Path]] = None,
                     bed: Optional[Union[str, Path]] = None,
                     bim: Optional[Union[str, Path]] = None,
                     fam: Optional[Union[str, Path]] = None,
                     transpose: bool = False) -> da.core.Array:
    """Gathers plink array from possible formats

    Requires one of the following parameter configurations to be satisfied to load an array:
        - path_to_plink_files is specified and no other parameters are specified
        - bim AND fam AND bed are specified and no other parameters are specified

    Parameters
    ----------
    path_to_plink_files : path_like, optional
        Assuming bim, fam, bed files are in the following format
            </path/to/data.bim>
            </path/to/data.fam>
            </path/to/data.bed>
        Then, path_to_files would be '/path/to/data'
    bed : path_like, optional
        '/path/to/data.bed'
    bim : path_like, optional
        '/path/to/data.bim'
    fam : path_like, optional
        '/path/to/data.fam'
    transpose : bool
        Whether `array` is stored/loaded in transposed format

        If A is stored/loaded as A.T but SVD(A) is desired set transpose flag to True

    Returns
    -------
    array : dask.array.core.Array

    """
    if path_to_plink_files is not None and not all([bed, bim, fam]):
        (_, _, G) = read_plink(path_to_plink_files)
        array = G
    elif all(p is not None for p in [bed, bim, fam]) and not path_to_plink_files:
        G = read_plink1_bin(bed, bim, fam)
        array = G.data
    else:
        raise ValueError(
            'Uninterpretable input.'
            ' Please specify path_to_plink_files xor (bed and bim and fam)'
        )

    try:
        array = da.from_array(array)
    except AttributeError:
        raise ValueError('Uninterpretable array.')
    except ValueError:
        pass

    if len(array.shape) != 2:
        raise ValueError("Must be a 2-D array")

    if transpose:
        array = array.T

    return array
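
A short usage sketch for load_plink_array, assuming a hypothetical prefix "/path/to/data" naming a matching .bed/.bim/.fam triple; exactly one of the two configurations may be used per call:

# Configuration 1: a shared prefix (dispatches to read_plink).
arr = load_plink_array(path_to_plink_files="/path/to/data")

# Configuration 2: explicit file paths (dispatches to read_plink1_bin).
arr = load_plink_array(bed="/path/to/data.bed",
                       bim="/path/to/data.bim",
                       fam="/path/to/data.fam")
print(arr.shape)  # a 2-D dask array; pass transpose=True to get its transpose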
Example #12

            TType=torch.cuda.FloatTensor
    else:
        if args.double:
            TType=torch.DoubleTensor
        else:
            TType=torch.FloatTensor
    if args.nosubnormal:
        torch.set_flush_denormal(True)
        #floatlib.set_ftz()
        #floatlib.set_daz()
    
    #n = int(args.cols); p = int(args.rows)
    bedfile = "/shared/ukbiobank_filtered/filtered_200k.bed"
    famfile = "/shared/ukbiobank_filtered/filtered_200k.2.fam"

    G = read_plink1_bin(bedfile, fam=famfile, verbose=False)

    n = G.shape[0]
    p_pheno = 11
    p = G.shape[1] + 6

    start_ind = (p // size) * rank
    end_ind   = (p // size) * (rank + 1) 
    pheno = genfromtxt("/shared/ukbiobank_filtered/ukb_short.filtered.200k.tab", skip_header=1)

    if rank != size - 1:
        X_chunk = G[:, start_ind:end_ind].data.compute()
    else:
        X_chunk = da.hstack([G[:,start_ind:].data, da.zeros((n, 6))]).compute()
        X_chunk[:, -11:] = pheno[:, 1:p_pheno + 1]
Example #13
import numpy as np
import pandas as pd
from pandas_plink import read_plink1_bin


def load_genotype_from_bedfile(bedfile, indiv_list, snplist_to_exclude, chromosome=None,
                               load_first_n_samples=None, missing_rate_cutoff=0.5,
                               return_snp=False, standardize=True):
    G = read_plink1_bin(bedfile, verbose=False)
    
    if chromosome is not None:
        chr_str = G.chrom[0].values.tolist()
        if 'chr' in chr_str:
            chromosome = 'chr' + str(chromosome)
        else:
            chromosome = str(chromosome)
        G = G.where(G.chrom == chromosome, drop=True)
    
    df_geno_indiv = pd.DataFrame({'indiv': G.sample.to_series().tolist()})
    df_geno_indiv['idx'] = [ i for i in range(df_geno_indiv.shape[0]) ]
    
    if indiv_list is None:
        indiv_list = G.sample.to_series().tolist()
        if load_first_n_samples is not None:
            indiv_list = indiv_list[:load_first_n_samples]
    df_target_indiv = pd.DataFrame({'indiv': indiv_list})
    df_geno_indiv = pd.merge(df_geno_indiv, df_target_indiv, on='indiv').sort_values(by=['idx'])
    if df_geno_indiv.shape[0] != len(indiv_list):
        raise ValueError('There are input individuals that do not appear in BED file.')
    query_indiv_list = df_geno_indiv.indiv.tolist()
    
    
    snpid = G.variant.variant.to_series().to_list()
    snpid = np.array([ s.split('_')[1] for s in snpid ])
    if return_snp is True:
        a0 = G.variant.a0.to_series().to_numpy()
        a1 = G.variant.a1.to_series().to_numpy()       
        chrom = G.variant.chrom.to_series().to_numpy()    
    
    geno = G.sel(sample=query_indiv_list).values

    # re-order rows to match the target indiv_list
    # (match_y_to_x is a helper defined elsewhere in the original module)
    geno = geno[match_y_to_x(np.array(query_indiv_list), np.array(indiv_list)), :]
    
    # filter out unwanted snps
    geno = geno[:, ~np.isin(snpid, snplist_to_exclude)]
    if return_snp is True:
        a0 = a0[~np.isin(snpid, snplist_to_exclude)]
        a1 = a1[~np.isin(snpid, snplist_to_exclude)]
        chrom = chrom[~np.isin(snpid, snplist_to_exclude)]
        
    snpid = snpid[~np.isin(snpid, snplist_to_exclude)]
   
    # filter out genotypes with high missing rate
    missing_rate = np.isnan(geno).mean(axis=0)
    geno = geno[:, missing_rate < missing_rate_cutoff]
    if return_snp is True:
        snpid = snpid[missing_rate < missing_rate_cutoff]
        a0 = a0[missing_rate < missing_rate_cutoff]
        a1 = a1[missing_rate < missing_rate_cutoff]
        chrom = chrom[missing_rate < missing_rate_cutoff]
        
    maf = np.nanmean(geno, axis=0) / 2
    
    # impute genotype missing value
    miss_x, miss_y = np.where(np.isnan(geno))
    geno[(miss_x, miss_y)] = maf[miss_y] * 2
    var_geno = 2 * maf * (1 - maf)
    
    # keep only genotypes with variance != 0
    to_keep = var_geno != 0
    geno = geno[:, to_keep]
    if return_snp is True:
        snpid = snpid[to_keep]
        a0 = a0[to_keep]
        a1 = a1[to_keep]
        chrom = chrom[to_keep]
        
    maf = maf[to_keep]
    var_geno = var_geno[to_keep]
    if standardize is True:
        geno = (geno - 2 * maf) / np.sqrt(var_geno)
    
    if return_snp is True:
        return geno, indiv_list, np.sqrt(var_geno), (snpid.tolist(), a0.tolist(), a1.tolist(), chrom.tolist())
    else:
        return geno, indiv_list, np.sqrt(var_geno)
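
A hedged usage sketch; "geno.bed" is a hypothetical path, and the helper match_y_to_x must be importable from the original module:

geno, indiv, sd = load_genotype_from_bedfile(
    "geno.bed",
    indiv_list=None,                  # keep all samples...
    snplist_to_exclude=np.array([]),
    load_first_n_samples=100,         # ...then truncate to the first 100
)
# With standardize=True (the default), every kept SNP column is centered and
# scaled; sd holds sqrt(2 * maf * (1 - maf)) for each kept SNP.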
Example #14
import pandas_plink as pp
from dask.diagnostics import ProgressBar
for i in range(1, 23):
    G = pp.read_plink1_bin("PATH_TO_BED_TRAIN" + str(i) + '.bed')
    G = G.astype('int8')  # assumes no missing genotypes (NaN would be corrupted)
    G = G.to_dataset()
    with ProgressBar():
        G = G.compute()  # materialize in memory (shows progress)
    with ProgressBar():
        G.to_zarr('PATH_TO_TRAINING_DATA' + str(i))
Example #15
import pandas_plink as pp
from dask.diagnostics import ProgressBar
for i in range(1, 24):
    G = pp.read_plink1_bin("PATH_TO_CHROMOSOME_DATA_" + str(i) + '.bed')
    G = G.astype('int8')  # assumes no missing genotypes (NaN would be corrupted)
    G = G.to_dataset()
    with ProgressBar():
        G = G.compute()  # materialize in memory (shows progress)
    with ProgressBar():
        G.to_zarr('PATH_TO_ZARR_' + str(i))
Example #16
    def plink_inputs(self):
        # Initializing some variables
        plink_exec = genoml.dependencies.check_plink()
        impute_type = self.impute_type
        addit_df = self.addit_df
        pheno_df = self.pheno_df

        outfile_h5 = self.run_prefix + ".dataForML.h5"
        pheno_df.to_hdf(outfile_h5, key='pheno', mode='w')

        if (self.geno_path != None):
            if (self.skip_prune == "no"):
                # Assemble the PLINK shell commands
                bash1a = f"{plink_exec} --bfile " + self.geno_path + " --indep-pairwise 1000 50 " + self.r2
                bash1b = f"{plink_exec} --bfile " + self.geno_path + " --extract " + self.run_prefix + \
                    ".p_threshold_variants.tab" + " --indep-pairwise 1000 50 " + self.r2
                # may want to consider outputting temp_genos to dir in run_prefix
                bash2 = f"{plink_exec} --bfile " + self.geno_path + \
                    " --extract plink.prune.in --make-bed --out temp_genos"
                bash3 = "cut -f 2,5 temp_genos.bim > " + \
                    self.run_prefix + ".variants_and_alleles.tab"
                bash4 = "rm plink.log"
                bash5 = "rm plink.prune.*"
                #    bash6 = "rm " + self.run_prefix + ".log"
                # Set the bash command groups
                cmds_a = [bash1a, bash2, bash3, bash4, bash5]
                cmds_b = [bash1b, bash2, bash3, bash4, bash5]

                if (self.gwas_path != None) & (self.geno_path != None):
                    p_thresh = self.p_gwas
                    gwas_df_reduced = self.gwas_df[['SNP', 'p']]
                    snps_to_keep = gwas_df_reduced.loc[(gwas_df_reduced['p'] <=
                                                        p_thresh)]
                    outfile = self.run_prefix + ".p_threshold_variants.tab"
                    snps_to_keep.to_csv(outfile, index=False, sep="\t")
                    print(
                        f"Your candidate variant list prior to pruning is right here: {outfile}."
                    )

                if (self.gwas_path == None) & (self.geno_path != None):
                    print(
                        f"A list of pruned variants and the allele being counted in the dosages (usually the minor allele) can be found here: {self.run_prefix}.variants_and_alleles.tab"
                    )
                    for cmd in cmds_a:
                        subprocess.run(cmd, shell=True)

                if (self.gwas_path != None) & (self.geno_path != None):
                    print(
                        f"A list of pruned variants and the allele being counted in the dosages (usually the minor allele) can be found here: {self.run_prefix}.variants_and_alleles.tab"
                    )
                    for cmd in cmds_b:
                        subprocess.run(cmd, shell=True)

            if (self.skip_prune == "yes"):
                bash1a = f"{plink_exec} --bfile " + self.geno_path
                bash1b = f"{plink_exec} --bfile " + self.geno_path + " --extract " + self.run_prefix + ".p_threshold_variants.tab" + " --make-bed --out temp_genos"
                # may want to consider outputting temp_genos to dir in run_prefix
                bash2 = f"{plink_exec} --bfile " + self.geno_path + " --make-bed --out temp_genos"
                bash3 = "cut -f 2,5 temp_genos.bim > " + self.run_prefix + ".variants_and_alleles.tab"
                bash4 = "rm plink.log"

                # Set the bash command groups
                cmds_a = [bash1a, bash2, bash3, bash4]
                cmds_b = [bash1b, bash3, bash4]

                if (self.gwas_path != None) & (self.geno_path != None):
                    p_thresh = self.p_gwas
                    gwas_df_reduced = self.gwas_df[['SNP', 'p']]
                    snps_to_keep = gwas_df_reduced.loc[(gwas_df_reduced['p'] <=
                                                        p_thresh)]
                    outfile = self.run_prefix + ".p_threshold_variants.tab"
                    snps_to_keep.to_csv(outfile, index=False, sep="\t")
                    print(
                        f"Your candidate variant list is right here: {outfile}."
                    )

                if (self.gwas_path == None) & (self.geno_path != None):
                    print(
                        f"A list of variants and the allele being counted in the dosages (usually the minor allele) can be found here: {self.run_prefix}.variants_and_alleles.tab"
                    )
                    for cmd in cmds_a:
                        subprocess.run(cmd, shell=True)

                if (self.gwas_path != None) & (self.geno_path != None):
                    print(
                        f"A list of variants and the allele being counted in the dosages (usually the minor allele) can be found here: {self.run_prefix}.variants_and_alleles.tab"
                    )
                    for cmd in cmds_b:
                        subprocess.run(cmd, shell=True)

        if (self.geno_path != None):

            g = read_plink1_bin('temp_genos.bed')
            g_pruned = g.drop([
                'fid', 'father', 'mother', 'gender', 'trait', 'chrom', 'cm',
                'pos', 'a1'
            ])

            g_pruned = g_pruned.set_index({'sample': 'iid', 'variant': 'snp'})
            g_pruned.values = g_pruned.values.astype('int')

            # Swap the pandas-plink genotype coding to match PLINK's .raw format.

            # For example, assuming C is the minor allele, PLINK's .raw coding
            # labels homozygous minor as 2 and homozygous major as 0:
            # A A  ->  0
            # A C  ->  1
            # C C  ->  2
            # 0 0  ->  NA

            # whereas read_plink1_bin flips these, with homozygous minor = 0
            # and homozygous major = 2:
            # A A  ->  2
            # A C  ->  1
            # C C  ->  0
            # 0 0  ->  NA

            two_idx = (g_pruned.values == 2)
            zero_idx = (g_pruned.values == 0)

            g_pruned.values[two_idx] = 0
            g_pruned.values[zero_idx] = 2
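
            # Equivalent one-liner, assuming only 0/1/2 codes remain after the
            # int cast above: g_pruned.values = 2 - g_pruned.values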

            g_pd = g_pruned.to_pandas()
            g_pd.reset_index(inplace=True)
            raw_df = g_pd.rename(columns={'sample': 'ID'})
            #    del raw_df.index.name
            #    del raw_df.columns.name

            # now, remove temp_genos
            bash_rm_temp = "rm temp_genos.*"
            print(bash_rm_temp)
            subprocess.run(bash_rm_temp, shell=True)

        # Check the impute flag and execute.
        # Currently only mean and median imputation are supported.
        impute_list = ["mean", "median"]

        if (self.geno_path != None):

            if impute_type not in impute_list:
                return "The 2 types of imputation currently supported are 'mean' and 'median'"
            elif impute_type.lower() == "mean":
                raw_df = raw_df.fillna(raw_df.mean())
            elif impute_type.lower() == "median":
                raw_df = raw_df.fillna(raw_df.median())
            print("")
            print(
                f"You have just imputed your genotype features, covering up NAs with the column {impute_type} so that analyses don't crash due to missing data."
            )
            print(
                "Now your genotype features might look a little better (showing the first few lines of the left-most and right-most columns)..."
            )
            print("#" * 70)
            print(raw_df.describe())
            print("#" * 70)
            print("")

        # Check the imputation of non-genotype features.

        if (self.addit_path != None):
            if impute_type not in impute_list:
                return "The 2 types of imputation currently supported are 'mean' and 'median'"
            elif impute_type.lower() == "mean":
                addit_df = addit_df.fillna(addit_df.mean())
            elif impute_type.lower() == "median":
                addit_df = addit_df.fillna(addit_df.median())
            print("")
            print(
                f"You have just imputed your non-genotype features, covering up NAs with the column {impute_type} so that analyses don't crash due to missing data."
            )
            print(
                "Now your non-genotype features might look a little better (showing the first few lines of the left-most and right-most columns)..."
            )
            print("#" * 70)
            print(addit_df.describe())
            print("#" * 70)
            print("")

            # Build the list of feature columns to scale (everything but ID;
            # the ID column itself is kept for the merges below).
            cols = list(addit_df.columns)
            cols.remove('ID')

            # Z-scale the features
            print(f"Now Z-scaling your non-genotype features...")

            # Remove any columns with a standard deviation of zero
            print(
                f"Removing any columns that have a standard deviation of 0 prior to Z-scaling..."
            )

            if any(addit_df.std() == 0.0):
                print("")
                print(
                    f"Looks like there's at least one column with a standard deviation of 0. Let's remove that for you..."
                )
                print("")
                addit_keep = addit_df.drop(
                    addit_df.std()[addit_df.std() == 0.0].index.values, axis=1)
                addit_keep_list = list(addit_keep.columns.values)

                addit_df = addit_df[addit_keep_list]

                addit_keep_list.remove('ID')
                removed_list = np.setdiff1d(cols, addit_keep_list)
                for removed_column in range(len(removed_list)):
                    print(
                        f"The column {removed_list[removed_column]} was removed"
                    )

                cols = addit_keep_list

            print("")
            for col in cols:
                if (addit_df[col].min() == 0.0) and (addit_df[col].max()
                                                     == 1.0):
                    print(
                        col,
                        "is likely a binary indicator or a proportion and will not be scaled."
                        " If you want it scaled, add 1 to every value of this column and rerun."
                    )
                else:
                    addit_df[col] = (addit_df[col] - addit_df[col].mean()
                                     ) / addit_df[col].std(ddof=0)

            print("")
            print(
                "You have just Z-scaled your non-genotype features, putting everything on a numeric scale similar to genotypes."
            )
            print(
                "Now your non-genotype features might look a little closer to zero (showing the first few lines of the left-most and right-most columns)..."
            )
            print("#" * 70)
            print(addit_df.describe())
            print("#" * 70)

        # Saving out the proper HDF5 file (to_hdf returns None, so nothing is assigned)
        if (self.geno_path != None):
            raw_df.to_hdf(outfile_h5, key='geno')

        if (self.addit_path != None):
            addit_df.to_hdf(outfile_h5, key='addit')

        if (self.geno_path != None) & (self.addit_path != None):
            pheno = pd.read_hdf(outfile_h5, key="pheno")
            geno = pd.read_hdf(outfile_h5, key="geno")
            addit = pd.read_hdf(outfile_h5, key="addit")
            temp = pd.merge(pheno, addit, on='ID', how='inner')
            merged = pd.merge(temp, geno, on='ID', how='inner')

        if (self.geno_path != None) & (self.addit_path == None):
            pheno = pd.read_hdf(outfile_h5, key="pheno")
            geno = pd.read_hdf(outfile_h5, key="geno")
            merged = pd.merge(pheno, geno, on='ID', how='inner')

        if (self.geno_path == None) & (self.addit_path != None):
            pheno = pd.read_hdf(outfile_h5, key="pheno")
            addit = pd.read_hdf(outfile_h5, key="addit")
            merged = pd.merge(pheno, addit, on='ID', how='inner')

        # Checking the reference column names flag
        # If this is a step that comes after harmonize, then a .txt file with columns to keep should have been produced
        # This is a list of column names from the reference dataset that the test dataset was harmonized against
        # We want to compare apples to apples, so we will only keep the column names that match
        if (self.refColsHarmonize != None):
            print("")
            print(
                f"Looks like you are munging after the harmonization step. Great! We will keep the columns generated from your reference dataset from that harmonize step that was exported to this file: {self.refColsHarmonize}"
            )
            print("")
            with open(self.refColsHarmonize, 'r') as refCols_file:
                ref_column_names_list = refCols_file.read().splitlines()

            # Keep the reference columns from the test dataset if found in test data
            matching_cols = merged[np.intersect1d(merged.columns,
                                                  ref_column_names_list)]

            # Make a list of final features that will be included in the model
            # This will be used again when remunging the reference dataset
            matching_cols_list = matching_cols.columns.values.tolist()

            # Save out the final list
            intersecting_cols_outfile = self.run_prefix + ".finalHarmonizedCols_toKeep.txt"

            with open(intersecting_cols_outfile, 'w') as filehandle:
                for col in matching_cols_list:
                    filehandle.write('%s\n' % col)

            print(
                f"A final list of harmonized columns between your reference and test dataset has been generated here: {intersecting_cols_outfile}"
            )
            print(
                f"Use this to re-train your reference dataset in order to move on to testing."
            )

            # Replace the dataframe variable with the matching options
            merged = matching_cols

        self.merged = merged
        merged.to_hdf(outfile_h5, key='dataForML')

        features_list = merged.columns.values.tolist()

        features_listpath = self.run_prefix + ".list_features.txt"
        with open(features_listpath, 'w') as f:
            for feature in features_list:
                f.write("%s\n" % feature)

        print(
            f"An updated list of {len(features_list)} features, including ID and PHENO, that is in your munged dataForML.h5 file can be found here: {features_listpath}"
        )

        print("")
        print(
            f"Your .dataForML file that has been fully munged can be found here: {outfile_h5}"
        )

        return merged