def load_bed(geno_temp, chromo=1, indiv=None, mid_buffer=2e6):
    '''load bed-file and split in effect and null SNPs

    # Parameters:
    geno_temp (str): path template for the bed-file, containing a '%d'
        placeholder for the chromosome number
    chromo (int): chromosome number
    indiv (None or np.ndarray): if None, use all individuals in bed-file;
        if np.ndarray, use elements as individuals
    mid_buffer (int): number of bp up- and downstream of center of
        chromosome to leave out

    # Returns:
    G (pd.DataFrame): DataFrame with one indiv per row, one SNP per column
    eff_snps, null_snps (np.ndarray): arrays of ints with positions of
        effect- and null-SNPs
    rsid (np.ndarray): array of str with rsid names of SNPs in G
    '''
    print(chromo)
    bed = PyPlink(geno_temp % chromo)
    print(bed.get_nb_markers())
    fam = bed.get_fam()

    if indiv is None:
        indiv = fam.iid.astype(int)
    ind = np.isin(fam.iid.astype(int), indiv)
    indiv = fam.loc[ind, 'iid'].astype(int).values

    G = []
    rsids = []
    removed = 0
    for g in tqdm(bed.iter_geno(), total=bed.get_nb_markers()):
        rs = g[0]
        gen = g[1][ind]
        g_ind = gen == -1
        if g_ind.mean() < 0.1:
            # mean-impute missing genotypes (truncated to the integer dtype)
            gen[g_ind] = gen[~g_ind].mean()
            G.append(gen)
            rsids.append(rs)
        else:
            removed += 1
    print(f'removed {removed} SNPs due to missing>10%')

    G = pd.DataFrame(
        np.array(G).T,
        index=indiv,
        columns=['c%d:%d' % (chromo, x) for x in range(len(rsids))],
    )

    bim = bed.get_bim().loc[rsids]
    mid = bim.pos.min() + (bim.pos.max() - bim.pos.min()) // 2
    eff_snps = np.where(bim.pos < mid - mid_buffer)[0]
    null_snps = np.where(bim.pos > mid + mid_buffer)[0]
    return G, eff_snps, null_snps, np.array(rsids)
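# A minimal usage sketch for load_bed above. The path template 'data/chr%d'
# is hypothetical; it must expand to a per-chromosome bed/bim/fam prefix.
G, eff_snps, null_snps, rsids = load_bed('data/chr%d', chromo=1,
                                         mid_buffer=2e6)
print(G.shape, len(eff_snps), len(null_snps))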
def setUpClass(cls): # Loading the data data = pd.read_csv( resource_filename(__name__, "data/statistics/linear.txt.bz2"), sep="\t", compression="bz2", ) # Creating the index data["sample"] = ["s{}".format(i + 1) for i in range(data.shape[0])] data = data.set_index("sample") # Creating the dummy phenotype container cls.phenotypes = _DummyPhenotypes() cls.phenotypes.data = data.drop( ["snp{}".format(i + 1) for i in range(5)], axis=1, ) # Creating a temporary directory cls.tmp_dir = TemporaryDirectory(prefix="genetest_test_linear_") # The plink file prefix cls.plink_prefix = os.path.join(cls.tmp_dir.name, "input") # Permuting the sample to add a bit of randomness new_sample_order = np.random.permutation(data.index) # Creating the BED file with PyPlink(cls.plink_prefix, "w") as bed: for snp in [s for s in data.columns if s.startswith("snp")]: bed.write_genotypes(data.loc[new_sample_order, snp]) # Creating the BIM file with open(cls.plink_prefix + ".bim", "w") as bim: print(3, "snp1", 0, 1234, "T", "C", sep="\t", file=bim) print(3, "snp2", 0, 9618, "C", "A", sep="\t", file=bim) print(2, "snp3", 0, 1519, "G", "T", sep="\t", file=bim) print(1, "snp4", 0, 5871, "G", "A", sep="\t", file=bim) print(23, "snp5", 0, 2938, "T", "C", sep="\t", file=bim) # Creating the FAM file with open(cls.plink_prefix + ".fam", "w") as fam: for sample in new_sample_order: print(sample, sample, 0, 0, 0, -9, file=fam) # Creating the genotype parser cls.genotypes = parsers["plink"](cls.plink_prefix)
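# Sketch of a read-back sanity check for the files written in setUpClass
# above ('plink_prefix' and 'new_sample_order' stand in for the locals of
# that method): PyPlink should see the five markers and all samples.
with PyPlink(plink_prefix) as bed:
    assert bed.get_nb_markers() == 5  # snp1..snp5
    assert bed.get_nb_samples() == len(new_sample_order)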
def _get_matrix(pfile, max_block):
    """Extract a genotype matrix from plink file."""
    with PyPlink(pfile) as bed:
        bim = bed.get_bim()
        fam = bed.get_fam()
        n = fam.shape[0]
        p = bim.shape[0]
        assert max_block <= p

        genotypemat = np.zeros((n, max_block), dtype=np.int64)
        u = 0
        for loci_name, genotypes in bed:
            genotypemat[:, u] = np.array(genotypes)
            u += 1
            if u >= max_block:
                break

    return genotypemat
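# Usage sketch for _get_matrix ('data/mystudy' is a hypothetical prefix):
# pulls the first 100 markers into an (n_samples, 100) int64 matrix, with
# -1 still marking missing calls.
mat = _get_matrix('data/mystudy', max_block=100)
print(mat.shape)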
def setUpClass(cls):
    # Creating random data
    cls.data = pd.DataFrame(
        dict(
            pheno=np.random.randint(1, 100, 100),
            var1=np.random.randint(1, 100, 100),
            var2=np.random.rand(100),
            var3=["x{}".format(i) for i in np.random.randint(0, 3, 100)],
            var4=["y{}".format(i) for i in np.random.randint(0, 2, 100)],
            var5=np.random.randint(0, 4, 100),
            snp=binom.rvs(2, 0.3, size=100),
        ),
        index=["sample_{}".format(i + 1) for i in range(100)],
    )

    # Changing one factor to categorical data
    cls.data.loc[:, "var5"] = cls.data.var5.astype("category")

    # Creating the dummy phenotype container
    phenotypes = ["pheno"] + ["var{}".format(i + 1) for i in range(5)]
    cls.phenotypes = _DummyPhenotypes()
    cls.phenotypes.data = cls.data[phenotypes].copy()

    # Creating a temporary directory
    cls.tmp_dir = TemporaryDirectory(prefix="genetest_")

    # The plink file prefix
    cls.plink_prefix = path.join(cls.tmp_dir.name, "input")

    # Permuting the sample to add a bit of randomness
    new_sample_order = np.random.permutation(cls.data.index)

    # Creating the BED file
    with PyPlink(cls.plink_prefix, "w") as bed:
        bed.write_genotypes(cls.data.loc[new_sample_order, "snp"])

    # Creating the BIM file
    with open(cls.plink_prefix + ".bim", "w") as bim:
        print(1, "snp", 0, 1, "B", "A", sep="\t", file=bim)

    # Creating the FAM file
    with open(cls.plink_prefix + ".fam", "w") as fam:
        for sample in new_sample_order:
            print(sample, sample, 0, 0, 0, -9, file=fam)

    # Creating the genotype parser
    cls.genotypes = parsers["plink"](cls.plink_prefix)
def setUpClass(cls):
    # Loading the data
    data = pd.read_csv(
        resource_filename(__name__, "data/statistics/factors.txt.bz2"),
        sep="\t",
        compression="bz2",
    ).set_index("sample_id")

    # Creating the dummy phenotype container
    cls.phenotypes = _DummyPhenotypes()
    cls.phenotypes.data = data.drop(
        [col for col in data.columns if col.startswith("snp")],
        axis=1,
    )

    # Creating a temporary directory
    cls.tmp_dir = TemporaryDirectory(prefix="genetest_test_linear_")

    # The plink file prefix
    cls.plink_prefix = os.path.join(cls.tmp_dir.name, "input")

    # Permuting the sample to add a bit of randomness
    new_sample_order = np.random.permutation(data.index)

    # Creating the BED file
    with PyPlink(cls.plink_prefix, "w") as bed:
        for i in range(3):
            snp = "snp{}".format(i + 1)
            bed.write_genotypes(data.loc[new_sample_order, snp])

    # Creating the BIM file
    with open(cls.plink_prefix + ".bim", "w") as bim:
        print(1, "snp1", 0, 1, "B", "A", sep="\t", file=bim)
        print(1, "snp2", 0, 2, "B", "A", sep="\t", file=bim)
        print(1, "snp3", 0, 3, "B", "A", sep="\t", file=bim)

    # Creating the FAM file
    with open(cls.plink_prefix + ".fam", "w") as fam:
        for sample in new_sample_order:
            print(sample, sample, 0, 0, 0, -9, file=fam)
def get_genotypes(rsid, plink_path, sub_in):
    """Retrieve genotype matrix from variant major format

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: list of subjects to include
    :return: genotypematrix
    """
    reader = PyPlink(plink_path)
    lg.debug('First item of sub_in is %s with %s',
             sub_in[0], type(sub_in[0]))
    n = reader.get_nb_samples()
    genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)

    pos_index = 0
    for snp, genotype in reader.iter_geno_marker(rsid):
        if snp not in rsid:
            continue
        else:
            genotypematrix[:, pos_index] = genotype[sub_in]
            pos_index += 1

    reader.close()
    return genotypematrix
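# Usage sketch for get_genotypes (hypothetical paths and IDs). 'sub_in'
# must be a boolean mask aligned with PyPlink's sample order, so it is
# built from the FAM file first.
reader = PyPlink('data/mystudy')
sub_in = reader.get_fam().iid.isin(['sample_1', 'sample_2']).values
reader.close()
geno = get_genotypes(['rs123', 'rs456'], 'data/mystudy', sub_in)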
"""Creates probabilities from an additive genotype.""" if genotype == 0: return (1, 0, 0) if genotype == 1: return (0, 1, 0) if genotype == 2: return (0, 0, 1) if genotype == -1: # Lower than normal probabilities return (0.8, 0.1, 0.1) with PyPlink("../plink/btest") as bed, \ open("impute2_test.impute2", "w") as impute2_f, \ open("impute2_test.sample", "w") as impute2_s: # Getting the FAM and the BIM fam = bed.get_fam() bim = bed.get_bim() # Generating the IMPUTE2 file for v, genotypes in bed.iter_geno(): info = bim.loc[v, :] assert v == info.name r = re.search(r"(:dup[0-9]+)$", v) if r: v = v.replace(r.group(1), "")
out = args.out


# In[16]:


log.info_head("Data Loading")


# # parse input files

# In[17]:


plink = None
plink_bim = None
plink_fam = None

if args.bfile is not None:
    plink = PyPlink(args.bfile)
    plink_bim = plink.get_bim()
    plink_fam = plink.get_fam().astype({
        'fid': str,
        'iid': str
    }).rename(
        columns={
            'fid': 'FID',
            'iid': 'IID',
            'father': 'fID',
            'mother': 'mID',
            'gender': 'sex'
        })
    log.info("{} samples ({} males, {} females) loaded from {}".format(
        plink_fam.shape[0], (plink_fam['sex'] == 1).sum(),
def convert_beeline(i_filenames, out_dir, locations, other_opts):
    """Convert beeline report(s) to Plink files.

    Args:
        i_filenames (list): a list of file names (str)
        out_dir (str): the name of the output directory
        locations (dict): a dictionary from marker ID to genomic location
        other_opts (argparse.Namespace): the program options

    """
    # The samples that were already seen
    seen_samples = set()

    for i_filename in i_filenames:
        logging.info("Converting '{}'".format(i_filename))

        # Getting the output filename
        o_filename = os.path.splitext(os.path.basename(i_filename))[0]

        # Opening the file
        i_file = None
        if i_filename != "-":
            i_file = open(i_filename, "r")
        else:
            i_file = sys.stdin
            o_filename = "from_stdin"

        # The output files
        prefix = os.path.join(out_dir, o_filename)
        pedfile = None
        mapfile = None
        bedfile = None
        bimfile = None
        famfile = None
        sample_file = None
        sample_file_end = None
        sample_file_sep = None
        if other_opts.o_format == "ped":
            pedfile = open(prefix + ".ped", "w")
            mapfile = open(prefix + ".map", "w")
            sample_file = pedfile
            sample_file_end = ""
            sample_file_sep = "\t"
        elif other_opts.o_format == "bed":
            bedfile = PyPlink(prefix, "w", "INDIVIDUAL-major")
            bimfile = open(prefix + ".bim", "w")
            famfile = open(prefix + ".fam", "w")
            sample_file = famfile
            sample_file_end = "\n"
            sample_file_sep = " "

        # Reading the file (or STDIN)
        try:
            # The number of markers
            nb_markers = None

            # Reading the assay information
            line = i_file.readline().rstrip("\r\n")
            while line != "[Header]":
                line = i_file.readline().rstrip("\r\n")
            while not line.startswith("[Data]"):
                if line.startswith(other_opts.nb_snps_kw):
                    nb_markers = int(line.rstrip("\r\n").split(",")[-1])
                line = i_file.readline()

            if nb_markers is None:
                raise ProgramError(
                    "{}: invalid header (missing '{}' value)".format(
                        i_filename,
                        other_opts.nb_snps_kw,
                    )
                )

            logging.info("There are {:,d} markers".format(nb_markers))

            # Reading and checking the header
            header = {
                name: i
                for i, name in
                enumerate(i_file.readline().rstrip("\r\n").split(","))
            }
            required_columns = (
                other_opts.beeline_id,
                other_opts.beeline_sample,
                other_opts.beeline_a1,
                other_opts.beeline_a2,
            )
            for name in required_columns:
                if name not in header:
                    raise ProgramError(
                        "{}: '{}': missing column".format(i_filename, name),
                    )

            # Creating the list that will contain the marker list
            current_marker_i = 0
            all_markers = [None for i in range(nb_markers)]
            nb_samples = 0

            # The genotypes (if in BED format)
            genotypes = None
            if other_opts.o_format == "bed":
                genotypes = [-1 for i in range(nb_markers)]

            # Reading the first data line
            line = i_file.readline()
            row = line.rstrip("\r\n").split(",")

            # Alleles of each marker
            marker_alleles = {}

            while line != "":
                # Getting the marker name and sample id
                sample = row[header[other_opts.beeline_sample]]
                if sample in seen_samples:
                    logging.warning("{}: duplicate sample "
                                    "found".format(sample))
                seen_samples.add(sample)

                # Logging
                logging.info("Processing {}".format(sample))
                print(sample, sample, "0", "0", "0", "-9",
                      sep=sample_file_sep, end=sample_file_end,
                      file=sample_file)

                # Reading the rest of the data for this sample
                current_sample = sample
                while current_sample == sample:
                    # Checking the marker order
                    marker = row[header[other_opts.beeline_id]]

                    # If the index is > than the length, it might be a
                    # duplicated sample...
                    if current_marker_i == len(all_markers):
                        break

                    if all_markers[current_marker_i] is None:
                        all_markers[current_marker_i] = marker
                    if all_markers[current_marker_i] != marker:
                        raise ProgramError(
                            "{}: marker order is not the same for "
                            "sample '{}'".format(i_filename, sample)
                        )

                    # Getting the genotype
                    allele_1 = row[header[other_opts.beeline_a1]]
                    allele_2 = row[header[other_opts.beeline_a2]]
                    genotype = "{} {}".format(allele_1, allele_2)
                    if "-" in genotype:
                        genotype = "0 0"

                    if other_opts.o_format == "ped":
                        pedfile.write("\t" + genotype)
                    else:
                        if marker not in locations:
                            raise ProgramError(
                                "{}: no mapping information".format(marker)
                            )

                        # Is this the first time we have seen this marker?
                        if marker not in marker_alleles:
                            marker_alleles[marker] = locations[marker].alleles

                        # Computing the genotypes
                        genotypes[current_marker_i] = encode_genotype(
                            allele_1, allele_2, marker_alleles[marker],
                        )

                    # Increasing the current marker
                    current_marker_i += 1

                    # Reading the next row
                    line = i_file.readline()
                    if line == "":
                        # End of file
                        break

                    # Splitting and getting the current sample
                    row = line.rstrip("\r\n").split(",")
                    current_sample = row[header[other_opts.beeline_sample]]

                if other_opts.o_format == "ped":
                    pedfile.write("\n")
                else:
                    bedfile.write_genotypes(genotypes)

                # If there is only one marker, there is a problem
                if nb_markers != 1 and current_marker_i == 1:
                    raise ProgramError(
                        "{}: data should be sorted by samples, not by "
                        "markers ('{}' had 1 marker, expecting "
                        "{:,d}".format(i_filename, sample, nb_markers)
                    )

                # Are there any missing markers?
                if current_marker_i != nb_markers:
                    nb_missing = abs(current_marker_i - nb_markers)
                    raise ProgramError(
                        "{}: missing {} marker{} for sample '{}'".format(
                            i_filename,
                            nb_missing,
                            "s" if nb_missing > 1 else "",
                            sample,
                        )
                    )
                current_marker_i = 0
                nb_samples += 1

            # Closing the output files
            logging.info("Done writing {:,d} samples".format(nb_samples))

            # Printing the map file
            for marker in all_markers:
                marker_location = None
                if marker in locations:
                    marker_location = locations[marker]
                else:
                    marker_location = _unknown_location
                    logging.warning("{}: no mapping "
                                    "information".format(marker))

                if other_opts.o_format == "ped":
                    print(marker_location.chrom, marker, "0",
                          marker_location.pos, sep="\t", file=mapfile)
                else:
                    alleles = {
                        v: k for k, v in marker_alleles[marker].items()
                    }
                    print(marker_location.chrom, marker, "0",
                          marker_location.pos, alleles["B"], alleles["A"],
                          sep="\t", file=bimfile)

        finally:
            # Closing the input file
            i_file.close()

            # Closing the output files
            for f in (pedfile, mapfile, bedfile, bimfile, famfile):
                if f is not None:
                    f.close()
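# 'encode_genotype' is not shown in this snippet. A possible sketch, assuming
# 'mapping' is the {nucleotide: "A"/"B"} dict stored in marker_alleles and
# that the additive code counts "B" alleles (matching the BIM written above,
# which places the "B" allele in the a1 column), with -1 for missing:
def encode_genotype(allele_1, allele_2, mapping):
    if allele_1 not in mapping or allele_2 not in mapping:
        return -1  # missing or unexpected allele
    return sum(1 for allele in (allele_1, allele_2) if mapping[allele] == "B")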
class GeneReader(object):
    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        lg.debug("""
        Loading the following files:
        Plink: %s
        Pheno: %s
        Variants: %s
        """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path + '.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        self.bim = self.bfile.get_bim()
        self.bim['rsid'] = self.bim.index.values
        self.fam = self.bfile.get_fam()
        self.n_chrom = self.bim.chrom.nunique()
        self.variants = self._get_var(self.variant_path)
        self.genes = self.variants.gene.unique()
        self.pheno = self._get_pheno(pheno_path)
        self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        chrom_check = [k for k in chromosomes
                       if k in self.bim.chrom.unique()]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants', nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
            dat['fid'] = dat['fid'].astype(str)
            dat['iid'] = dat['iid'].astype(str)
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender',
                           'Pheno']
            dat['fid'] = dat['fid'].astype(str)
            dat['iid'] = dat['iid'].astype(str)
        else:
            raise ValueError('Need either a 3 or 6 column file')
        lg.debug(self.fam.head())
        dat = pd.merge(self.fam, dat, on=['fid', 'iid'])
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
        if self.n < 2:
            raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls),
                np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.ndarray:
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        u = 0
        lg.info('Reading %s', gene)
        for i, g in reader.iter_geno_marker(marker):
            genotype_matrix[:, u] = g
            u += 1
            lg.debug('Processed variant %s', i)
        # treat missing genotypes (-1) as homozygous reference
        genotype_matrix[genotype_matrix == -1] = 0
        reader.close()
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.ndarray:
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)
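# Usage sketch for GeneReader (hypothetical file names): iterate per-gene
# genotype matrices, e.g. as input to a gene-based association test.
reader = GeneReader('data/study', 'data/pheno.txt', 'data/variants.txt')
for gene, matrix in zip(reader.genes, reader.gene_iterator()):
    print(gene, matrix.shape)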
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

from pyplink import PyPlink
from basic_tools import *


# # load plink, aa and check integrity

# In[2]:


plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G.get_fam().astype({
    'fid': str,
    'iid': str
}).rename(columns={
    'fid': 'FID',
    'iid': 'IID'
})
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()


# In[3]:


grm_path = 'data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'


# In[4]:
for i in {00..10};do python 5_1_association.py $i 1 0;done
for i in {11..20};do python 5_1_association.py $i 1 0;done
for i in {21..30};do python 5_1_association.py $i 1 0;done
for i in {31..40};do python 5_1_association.py $i 1 0;done
for i in {41..50};do python 5_1_association.py $i 1 0;done
for i in {51..60};do python 5_1_association.py $i 1 0;done
for i in {61..70};do python 5_1_association.py $i 1 0;done
for i in {71..80};do python 5_1_association.py $i 1 0;done
for i in {81..90};do python 5_1_association.py $i 1 0;done
for i in {91..97};do python 5_1_association.py $i 1 0;done
"""


# In[2]:


plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G.get_fam().astype({
    'fid': str,
    'iid': str
}).rename(columns={
    'fid': 'FID',
    'iid': 'IID'
})
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()


# In[3]:


#len(binary_continuous_traits)


# In[4]:
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Binary plink file reader.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Identify all multi-allelics.
        self.bim["multiallelic"] = False
        self.bim.loc[
            self.bim.duplicated(["chrom", "pos"], keep=False),
            "multiallelic"
        ] = True

        # We want to set the index for the FAM file
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=fid, iid=iid)
                for fid, iid in zip(self.fam.fid, self.fam.iid)
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            marker (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the
            variant as well as a vector of encoded genotypes.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Find the variant in the bim.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        info = self.bim.loc[
            (self.bim.chrom == plink_chrom) &
            (self.bim.pos == variant.pos), :
        ]

        if info.shape[0] == 0:
            return []
        elif info.shape[0] == 1:
            return self._get_biallelic_variant(variant, info)
        else:
            return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        # From 1.3.2 onwards, PyPlink sets unique names.
        info = info.iloc[0, :]
        variant_alleles = variant._encode_alleles([info.a2, info.a1])
        if (_check_alleles and variant_alleles != variant.alleles):
            # Variant with requested alleles is unavailable.
            return []

        geno = self._normalize_missing(self.bed.get_geno_marker(info.name))
        return [Genotypes(variant, geno, info.a2, info.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        # Check if alleles are specified.
        out = []
        if variant.alleles is None:
            # If no alleles are specified, we return all the possible
            # bi-allelic variants.
            for name, row in info.iterrows():
                geno = self.bed.get_geno_marker(name)
                geno = self._normalize_missing(geno)
                out.append(Genotypes(variant, geno, row.a2, row.a1, True))
        else:
            # Find the requested alleles.
            for name, row in info.iterrows():
                row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
                if row_alleles.issubset(variant.alleles_set):
                    out.extend(
                        self._get_biallelic_variant(
                            variant, info.loc[[name], :],
                            _check_alleles=False
                        )
                    )
        return out

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).
""" # Iterating over all markers for i, (_, genotypes) in enumerate(self.bed.iter_geno()): info = self.bim.iloc[i, :] yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos, [info.a1, info.a2]), self._normalize_missing(genotypes), reference=info.a2, coded=info.a1, multiallelic=info.multiallelic) def iter_variants(self): """Iterate over marker information.""" for idx, row in self.bim.iterrows(): yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos, [row.a1, row.a2]) def get_variants_in_region(self, chrom, start, end): """Iterate over variants in a region.""" bim = self.bim.loc[(self.bim["chrom"] == CHROM_STR_TO_INT[chrom]) & (start <= self.bim["pos"]) & (self.bim["pos"] <= end)] for i, g in enumerate(self.bed.iter_geno_marker(bim.index)): info = bim.iloc[i, :] name, geno = g yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos, [info.a1, info.a2]), self._normalize_missing(geno), reference=info.a2, coded=info.a1, multiallelic=info.multiallelic) def get_variant_by_name(self, name): """Get the genotype of a marker using it's name. Args: name (str): The name of the marker. Returns: list: A list of Genotypes (only one for PyPlink, see note below). Note ==== From PyPlink version 1.3.2 and onwards, each name is unique in the dataset. Hence, we can use the 'get_geno_marker' function and be sure only one variant is returned. """ # From 1.3.2 onwards, PyPlink sets unique names. # Getting the genotypes try: geno, i = self.bed.get_geno_marker(name, return_index=True) except ValueError: if name in self.bed.get_duplicated_markers(): # The variant is a duplicated one, so we go through all the # variants with the same name and the :dupx suffix return [ self.get_variant_by_name(dup_name).pop() for dup_name in self.bed.get_duplicated_markers()[name] ] else: # The variant is not in the BIM file, so we return an empty # list logger.warning("Variant {} was not found".format(name)) return [] else: info = self.bim.iloc[i, :] return [ Genotypes( Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos, [info.a1, info.a2]), self._normalize_missing(geno), reference=info.a2, coded=info.a1, multiallelic=info.multiallelic, ) ] def get_number_samples(self): """Returns the number of samples. Returns: int: The number of samples. """ return self.bed.get_nb_samples() def get_number_variants(self): """Returns the number of markers. Returns: int: The number of markers. """ return self.bed.get_nb_markers() def get_samples(self): return list(self.fam.index) @staticmethod def _normalize_missing(g): """Normalize a plink genotype vector.""" g = g.astype(float) g[g == -1.0] = np.nan return g
def extract_markers(fn, to_extract, out_prefix, out_format, prob_t, is_long):
    """Extracts according to names.

    Args:
        fn (str): the name of the input file
        to_extract (set): the list of markers to extract for each input file
        out_prefix (str): the output prefix
        out_format (list): the output format(s)
        prob_t (float): the probability threshold
        is_long (bool): True if format needs to be long

    """
    # The output files (probabilities)
    o_files = {
        suffix: open(out_prefix + "." + suffix, "w")
        for suffix in out_format if suffix not in {"bed"}
    }

    # If there is the 'bed' format, we actually need pyplink
    if "bed" in out_format:
        o_files["bed"] = (
            PyPlink(out_prefix, "w"),
            open(out_prefix + ".bim", "w"),
        )

    # Creating a fam (if bed)
    samples = get_samples(get_file_prefix(fn) + ".sample")
    sample_names = [
        "{}/{}".format(id_1, id_2)
        for id_1, id_2 in zip(samples.ID_1, samples.ID_2)
    ]

    # Writing the header (for dosage)
    if "dosage" in o_files:
        if is_long:
            print("fid", "iid", "chrom", "pos", "name", "minor", "major",
                  "dosage", sep="\t", file=o_files["dosage"])
        else:
            print("chrom", "pos", "name", "minor", "major", *sample_names,
                  sep="\t", file=o_files["dosage"])

    # Writing the header (for calls)
    if "calls" in o_files:
        if is_long:
            print("fid", "iid", "chrom", "name", "cm", "pos", "call",
                  sep="\t", file=o_files["calls"])
        else:
            print("chrom", "name", "cm", "pos", *sample_names, sep="\t",
                  file=o_files["calls"])

    # Extracted positions
    all_extracted = set()

    # Reading the impute2 file
    extracted = set()

    # Finding the name of the file containing the index
    file_index = index.get_index(fn, cols=[0, 1, 2],
                                 names=["chrom", "name", "pos"], sep=" ")

    # Keeping only required values from the index
    file_index = file_index[file_index.name.isin(to_extract)]

    # Getting all the marker values
    logging.info("Extracting {:,d} markers".format(len(file_index)))
    with index.get_open_func(fn)(fn, "r") as i_file:
        for seek_value in file_index.seek.values:
            # Seeking
            i_file.seek(int(seek_value))

            # Reading the line
            line = i_file.readline()
            row = line.rstrip("\n").split(" ")

            # The marker name
            name = row[1]

            # Printing the data
            print_data(o_files, prob_t, samples.ID_1, samples.ID_2,
                       line=line, row=row, is_long=is_long)

            # Saving statistics
            extracted.add(name)

    logging.info("Extracted {:,d} markers".format(len(extracted)))
    if len(to_extract - extracted) > 0:
        logging.warning("Missing {:,d} "
                        "markers".format(len(to_extract - extracted)))

    # Keeping track of what has been extracted
    all_extracted |= extracted

    # Extracting the companion files (if impute2 and files are present)
    if "impute2" in o_files:
        extract_companion_files(
            i_prefix=get_file_prefix(fn),
            to_extract=to_extract,
            o_prefix=out_prefix,
        )

    # Writing the FAM file if bed
    if "bed" in o_files:
        cols = ["ID_1", "ID_2", "father", "mother", "sex", "plink_pheno"]
        samples[cols].to_csv(out_prefix + ".fam", sep=" ", index=False,
                             header=False)

    # Closing the files
    for o_format, o_file in o_files.items():
        if o_format == "bed":
            o_file[0].close()
            o_file[1].close()
        else:
            o_file.close()

    # Extraction complete
    logging.info("Extraction of {:,d} markers "
                 "completed".format(len(all_extracted)))
def test_smaller_intersect(self):
    """Tests when the sample intersect is smaller between containers."""
    # Choosing 10 samples to exclude from the dataset
    to_exclude = np.random.choice(self.data.index, 10, replace=False)

    # Removing 5 samples from the phenotypes
    phenotypes = _DummyPhenotypes()
    phenotypes.data = self.data.drop(to_exclude[:5], axis=0)

    # Removing the next 5 for the genotypes
    plink_prefix = self.plink_prefix + "_less"
    geno_data = self.data.drop(to_exclude[5:], axis=0)
    with PyPlink(plink_prefix, "w") as bed:
        bed.write_genotypes(geno_data.snp)

    # Creating the BIM file
    with open(plink_prefix + ".bim", "w") as bim:
        print(1, "snp", 0, 1, "B", "A", sep="\t", file=bim)

    # Creating the FAM file
    with open(plink_prefix + ".fam", "w") as fam:
        for sample in geno_data.index:
            print(sample, sample, 0, 0, 0, -9, file=fam)

    # Creating the model specification
    predictors = [
        spec.genotypes.snp, spec.phenotypes.var1, spec.phenotypes.var2
    ]
    modelspec = spec.ModelSpec(
        outcome=spec.phenotypes.pheno,
        predictors=predictors,
        test="linear",
    )

    # Gathering the observed matrix
    with parsers["plink"](plink_prefix) as genotypes:
        matrix = modelspec.create_data_matrix(phenotypes, genotypes)

    # Subset of the new data
    data = self.data.drop(to_exclude, axis=0)

    # Checking the shape of the matrix
    self.assertEqual((data.shape[0], 5), matrix.shape,
                     "The observed matrix is not of the right shape")

    # Checking the intercept
    self.assertEqual([1], matrix.intercept.unique().tolist(),
                     "The intercept is not as expected")

    # Checking the outcome
    outcome_col = spec.phenotypes.pheno.id
    outcomes = matrix.loc[data.index, outcome_col]
    self.assertTrue(outcomes.equals(data.pheno),
                    "The outcomes are not as expected")

    # Checking the predictors
    translations = modelspec.get_translations()
    for predictor in predictors:
        # Getting the name of the predictor
        name = translations[predictor.id]

        # Comparing the values
        np.testing.assert_array_equal(
            matrix.loc[data.index, predictor.id].values,
            data[name].values,
            err_msg="The predictor '{}' is not as expected".format(name),
        )
def prepare_simulation(sample_size, n_geno=1000, seed=321, exp_var=0.5,
                       n_causal=10, threads=8):
    run_id = uuid.uuid4()
    run_dir = join('runs', f'run_{run_id}')
    os.makedirs(run_dir, exist_ok=True)
    print(f'creating dummy data in {run_dir}...')

    # create indiv file
    indiv_path = join(run_dir, 'indiv.txt')
    start = 10000000
    iid = pd.DataFrame([[i, i] for i in range(start, start + sample_size)],
                       columns=['FID', 'IID'])
    iid.to_csv(indiv_path, sep=' ', header=False, index=False)

    # create covariate file
    cov_path = join(run_dir, 'covariates.txt')
    age = np.random.randint(30, 81, sample_size)
    sex = np.random.randint(0, 2, sample_size)
    iid['age'] = age
    iid['sex'] = sex
    iid.to_csv(cov_path, sep=' ', header=True, index=False)

    # create toml file
    config_path = join(run_dir, 'config.toml')
    config = {
        # general parameters
        'seeds': [seed],
        'exp_vars': [exp_var],
        'n_causal': n_causal,
        'sample_sizes': [sample_size],
        'wdir': run_dir,
        # genetic parameters
        'bed': join(run_dir, 'chr%s'),
        'fam': join(run_dir, 'chr1.fam'),
        'normalize': True,
        'indiv': indiv_path,
        'mid_buffer': 0,
        # GAN parameters
        'stylegan2_models': '../models/',
        'stylegan2_name': 'stylegan2_healthy',
        'psi': 0.4,
        'diff_noise': False,
        'mult_scale': 2.0,
        # feature condensation parameters
        'n_pcs': 100,
        'size': 448,
        'tfms': 'tta',
        'model': 'resnet50',
        'pretraining': 'imagenet',
        'spatial': 'mean',
        'layers': ['L4'],
        # GWAS parameters
        'first_pc': 0,
        'last_pc': 9,
        'cov': cov_path,
        'cov_columns': ['sex'],
        'qcov_columns': ['age'],
        'threads': threads,
        'bolt': '../BOLT-LMM_v2.3.4/',
        'ref_map': "",
        'ldscores': "",
    }
    with open(config_path, 'w') as config_file:
        toml.dump(config, config_file)

    # create genetic data
    iid['father'] = 0
    iid['mother'] = 0
    iid['sex'] += 1
    iid['pheno'] = -9
    fam_df = iid[['FID', 'IID', 'father', 'mother', 'sex', 'pheno']]

    ld_df = pd.read_csv(join(BOLT_DIR, 'tables', 'LDSCORE.1000G_EUR.tab.gz'),
                        sep='\t')
    ld_df = ld_df.sample(n_geno).sort_values(['CHR', 'BP'])

    # draw genotypes as two Bernoulli allele draws per individual and SNP
    G = (np.random.rand(n_geno, sample_size, 2) >
         ld_df.MAF.values.reshape(-1, 1, 1)).sum(-1)

    last_ind = 0
    for chromo, chromo_df in tqdm(ld_df.groupby('CHR')):
        gen_path = join(run_dir, 'chr%d' % chromo)
        n_chromo = len(chromo_df)
        chromo_G = G[last_ind:(last_ind + n_chromo)]
        last_ind += n_chromo

        with PyPlink(gen_path, 'w') as bed:
            for g in chromo_G:
                bed.write_genotypes(g)

        fam_df.to_csv(gen_path + '.fam', sep='\t', header=False, index=False)

        bim_df = pd.DataFrame(
            np.array([
                n_chromo * [chromo],
                chromo_df.SNP.values,
                n_chromo * [0],
                chromo_df.BP.values,
                n_chromo * ['C'],
                n_chromo * ['A'],
            ]).T)
        bim_df.to_csv(gen_path + '.bim', sep='\t', header=False, index=False)
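# Usage sketch: build one small dummy run (assumes BOLT_DIR and the BOLT-LMM
# LD-score table referenced above are available locally).
prepare_simulation(sample_size=500, n_geno=200, n_causal=5, threads=4)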
class ReadPlink(object):
    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """
        :param plinkstem: plink stem file path

        """
        self._plinkstem = plinkstem
        self._bim_path = os.path.basename(self._plinkstem) + '.bim'
        self._bed_path = os.path.basename(self._plinkstem) + '.bed'
        self._fam_path = os.path.basename(self._plinkstem) + '.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        self._sample_subjects = None
        self._sample_variants = None

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Currently pandas_plink does not support fancy indexing, hence
        sample will load the genotypes of all subjects before randomly
        sampling subject IDs.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write to disk a list of variants
        :returns: a numpy matrix of size n*p

        """
        self._sample_subjects = np.random.choice(self.fam.index.values,
                                                 n, replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)

        if write_disk:
            # .loc: the sampled variants are index labels, not positions
            self.bim.loc[self._sample_variants].to_csv(
                'sampled_variants.csv')
            self.fam.iloc[self._sample_subjects].to_csv(
                'sampled_subjects.csv')

        genotypematrix = self.read_bed(self._sample_variants,
                                       self._sample_subjects)

        return genotypematrix

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        :param marker: list of SNPs
        :param subjects: list of subjects
        :returns: genotypematrix of size subjects*marker

        """
        if marker is None:
            P_size = self.P
            marker = self.bim.index.values
        else:
            P_size = len(marker)

        if subjects is None:
            N_size = self.N
            subjects = self.fam.index.values
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)

        j = 0
        for m, g in self.plinkfile.iter_geno_marker(marker):
            genotypematrix[:, j] = g[subjects]
            j += 1

        # treat missing genotypes (-1) as homozygous reference
        genotypematrix[genotypematrix < 0] = 0

        return genotypematrix
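# Usage sketch for ReadPlink ('data/mystudy' is a hypothetical stem):
# draw a 100 x 50 genotype matrix, resampling subjects with replacement.
plink = ReadPlink('data/mystudy')
mat = plink.sample(n=100, p=50)
print(mat.shape)  # (100, 50)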
args = parser.parse_args()

top_variant = None
top_p = 1.0
with open(args.assoc, 'r') as ihandle:
    rdr = csv.DictReader(ihandle, delimiter='\t')
    for row in rdr:
        p = float(row['p_lrt'])
        if p < top_p:
            top_p = p
            top_variant = row['rs']

if top_variant is None:
    raise Exception('No top variant chosen')

print(top_variant)

p = PyPlink(args.plink)
g = None
for marker, genotypes in p:
    if marker == top_variant:
        s = sum(x for x in genotypes.tolist() if x != -1)
        c = sum(1. for x in genotypes.tolist() if x != -1)
        meang = s / c
        # replace missing calls with the mean genotype
        g = [str(x) if x != -1 else str(meang) for x in genotypes]
        break

if g is None:
    raise Exception('Could not find top variant in plink file')

if args.previous is None or args.previous == 'NONE':
    previous = [['1'] for x in g]
else:
    previous = []
# NOTE: the original 'def' line was lost when this snippet was extracted;
# the name and parameter are reconstructed from the call site below.
def get_resampled_loci(number_markers):
    bootstrapped_list = [
        random.randint(0, number_markers - 1) for _ in range(number_markers)
    ]
    bootstrapped_list.sort()
    return bootstrapped_list


### parsing command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('bfile',
                    help='plink binary (bed, bim, fam) dataset prefix')
parser.add_argument('nboot', help='number of bootstrap replicates',
                    type=int)
args = parser.parse_args()

### perform plink bootstrap replicates
with PyPlink(args.bfile) as bed:
    bim = bed.get_bim()  # returns pandas.DataFrame of bim file
    nmarkers = bed.get_nb_markers()
    nsamples = bed.get_nb_samples()
    print("### Loaded {0} markers and {1} samples...".format(
        nmarkers, nsamples))

    for rep in range(args.nboot):
        print("### Performing bootstrap replicate {0} of {1}...".format(
            rep + 1, args.nboot))
        rep_list = get_resampled_loci(
            nmarkers)  # gets a list of resampled markers
        rep_basename = "rep" + str(rep)
        with PyPlink(rep_basename,
MAX_MAF = max(mafs) if mafs else 1
MAX_MAC = max(macs) if macs else 9000000000
OUT_QUALIFIED_VARIANTS = True if args['--ov'] else False

if len(mafs) == 0 and len(macs) == 0:
    sys.stderr.write(
        "At least one should be open: '--alt-frqs' and/or '--alt-acs'\n")
    sys.exit(-1)

# Open and work on VCF file.
# https://lemieuxl.github.io/pyplink/
# API: https://lemieuxl.github.io/pyplink/pyplink.html
# API demo: https://nbviewer.jupyter.org/github/lemieuxl/pyplink/blob/master/demo/PyPlink%20Demo.ipynb
from pyplink import PyPlink

# Getting the BED BIM FAM
bed = PyPlink(plink_prefix)
bim = bed.get_bim()
fam = bed.get_fam()
dup = bed.get_duplicated_markers()
if dup:
    sys.stderr.write('ERROR: Duplicate markers found as above!!!\n')
    sys.exit(-1)

snp_sets = set(bim.index)
# True/False array for the samples we want to keep.
select_samples = [x in samples for x in fam.loc[:, 'iid']]
# print(vcf.samples)

out = ("#CHROM BEGIN END MARKER_ID NUM_ALL_VARS NUM_PASS_VARS "
       "NUM_SING_VARS MAF/MAC_CUT").split()
sys.stdout.write(