def load_bed(geno_temp, chromo=1, indiv=None, mid_buffer=2e6):
    '''load bed-file and split in effect and null SNPs

    SNPs for which 10% or more of the selected individuals have a missing
    call (coded -1) are dropped. For the remaining SNPs, missing calls are
    replaced by the mean of the observed calls of that SNP.

    # Parameters:
    geno_temp (str): %-style template of the plink prefix; it is formatted
        with `chromo` (i.e. `geno_temp % chromo`)
    chromo (int): chromosome number
    indiv (None or np.ndarray): if None, use all individuals in bed-file
        if np.array, use elements as individuals
    mid_buffer (int): number of bp up- and downstream of center of chromosome
        to leave out

    # Returns:
    G (pd.DataFrame): DataFrame with one indiv per row, one SNP per column
        (float, because imputed means are generally not integers)
    eff_snps, null_snps (np.ndarray): array of ints with position of effect-
        and null-SNPs
    rsid (np.ndarray) array of str with rsid names of SNPs in G
    '''
    print(chromo)
    bed = PyPlink(geno_temp % chromo)
    try:
        n_markers = bed.get_nb_markers()
        print(n_markers)

        fam = bed.get_fam()
        fam_iid = fam.iid.astype(int)
        if indiv is None:
            indiv = fam_iid
        # Boolean mask over the fam rows; `indiv` is then re-derived from it
        # so that it follows the order of the bed-file.
        ind = np.isin(fam_iid, indiv)
        indiv = fam.loc[ind, 'iid'].astype(int).values

        G = []
        rsids = []
        removed = 0
        for rs, calls in tqdm(bed.iter_geno(), total=n_markers):
            # Cast to float BEFORE imputing: the raw calls are small
            # integers, and writing a fractional mean into an integer array
            # silently truncates it.
            gen = calls[ind].astype(float)
            missing = gen == -1
            if missing.mean() < 0.1:
                gen[missing] = gen[~missing].mean()
                G.append(gen)
                rsids.append(rs)
            else:
                removed += 1
        print(f'removed {removed} SNPs due to missing>10%')

        bim = bed.get_bim().loc[rsids]
    finally:
        # Release the file handle even if reading fails half-way.
        bed.close()

    # The explicit reshape keeps the matrix 2-D when no SNP survives.
    G = pd.DataFrame(
        np.array(G, dtype=float).reshape(len(rsids), len(indiv)).T,
        index=indiv,
        columns=['c%d:%d' % (chromo, x) for x in range(len(rsids))],
    )

    # Effect SNPs lie left of the central buffer, null SNPs right of it.
    mid = bim.pos.min() + (bim.pos.max() - bim.pos.min()) // 2
    eff_snps = np.where(bim.pos < mid - mid_buffer)[0]
    null_snps = np.where(bim.pos > mid + mid_buffer)[0]
    return G, eff_snps, null_snps, np.array(rsids)
# In[17]:

# PLINK handles stay None unless a binary fileset was given on the command line.
plink = plink_bim = plink_fam = None

if args.bfile is not None:
    plink = PyPlink(args.bfile)
    plink_bim = plink.get_bim()

    # Load the fam table, force string IDs and switch to the column names
    # used by the rest of the script.
    plink_fam = plink.get_fam()
    plink_fam = plink_fam.astype({'fid': str, 'iid': str})
    plink_fam = plink_fam.rename(columns={
        'fid': 'FID',
        'iid': 'IID',
        'father': 'fID',
        'mother': 'mID',
        'gender': 'sex',
    })

    # Sex code 1 is counted as male and 2 as female.
    log.info(
        "{} samples ({} males, {} females) loaded from {}".format(
            plink_fam.shape[0],
            (plink_fam['sex'] == 1).sum(),
            (plink_fam['sex'] == 2).sum(),
            args.bfile,
        )
    )
    log.info(
        "{} unphased variants loaded from {}".format(
            plink_bim.shape[0],
            args.bfile,
        )
    )

# In[18]:

phased_FID_list = None
import matplotlib.pyplot as plt
from pyplink import PyPlink
from basic_tools import *

# # load plink, aa and check integrity

# In[2]:

plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)

# fam table: string IDs, upper-case ID column names
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G.get_fam()
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G_fam.astype(
    {'fid': str, 'iid': str})
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G_fam.rename(
    columns={'fid': 'FID', 'iid': 'IID'})

# bim table: one row per variant
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

# In[3]:

grm_path = 'data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'

# In[4]:

#final_plink_aa_grm_path

# # load phenotype and check integrity
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Binary plink file reader.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Variants that share their (chrom, pos) with another BIM row are
        # flagged as multi-allelic.
        self.bim["multiallelic"] = False
        shares_position = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shares_position, "multiallelic"] = True

        # Index the FAM table by individual ID; if those are not unique,
        # fall back to a combined 'fid_iid' key.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)

        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique."
            )
            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=family_id, iid=individual_id)
                for family_id, individual_id
                in zip(self.fam.fid, self.fam.iid)
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying binary plink file."""
        self.bed.close()

    @staticmethod
    def _variant_from_bim_row(info):
        """Build a Variant from one row of the BIM table."""
        return Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                       [info.a1, info.a2])

    def _genotypes_from_bim_row(self, info, geno):
        """Build a Genotypes instance from a BIM row and its raw calls."""
        return Genotypes(
            self._variant_from_bim_row(info),
            self._normalize_missing(geno),
            reference=info.a2,
            coded=info.a1,
            multiallelic=info.multiallelic,
        )

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant
            as well as a vector of encoded genotypes. The list is empty when
            no BIM row matches the chromosome and position of the variant.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Find the variant in the bim.
        same_chrom = self.bim.chrom == CHROM_STR_TO_INT[variant.chrom]
        same_pos = self.bim.pos == variant.pos
        info = self.bim.loc[same_chrom & same_pos, :]

        n_hits = info.shape[0]
        if n_hits == 0:
            return []

        if n_hits == 1:
            return self._get_biallelic_variant(variant, info)

        return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Return the genotypes of the first row of `info` as a list.

        With `_check_alleles` set, an empty list is returned when the
        encoded (a2, a1) alleles of the row differ from `variant.alleles`.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        row = info.iloc[0, :]

        encoded = variant._encode_alleles([row.a2, row.a1])
        if _check_alleles and encoded != variant.alleles:
            # Variant with requested alleles is unavailable.
            return []

        raw_calls = self.bed.get_geno_marker(row.name)
        geno = self._normalize_missing(raw_calls)
        return [Genotypes(variant, geno, row.a2, row.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Return the genotypes for a position with several BIM rows."""
        if variant.alleles is None:
            # If no alleles are specified, we return all the possible
            # bi-allelic variants.
            return [
                Genotypes(
                    variant,
                    self._normalize_missing(self.bed.get_geno_marker(name)),
                    row.a2,
                    row.a1,
                    True,
                )
                for name, row in info.iterrows()
            ]

        # Otherwise keep only the rows whose alleles are among the requested
        # ones.
        matches = []
        for name, row in info.iterrows():
            row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
            if not row_alleles.issubset(variant.alleles_set):
                continue

            matches.extend(
                self._get_biallelic_variant(
                    variant, info.loc[[name], :], _check_alleles=False,
                )
            )

        return matches

    def iter_genotypes(self):
        """Iterates on available markers.

        Yields:
            Genotypes instances, in the order of the BED file.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # The i-th genotype vector is paired with the i-th BIM row.
        for i, (_, raw_calls) in enumerate(self.bed.iter_geno()):
            yield self._genotypes_from_bim_row(self.bim.iloc[i, :], raw_calls)

    def iter_variants(self):
        """Iterate over marker information."""
        for _, row in self.bim.iterrows():
            yield self._variant_from_bim_row(row)

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region (both bounds inclusive)."""
        in_region = (
            (self.bim["chrom"] == CHROM_STR_TO_INT[chrom]) &
            (start <= self.bim["pos"]) &
            (self.bim["pos"] <= end)
        )
        region_bim = self.bim.loc[in_region]

        markers = self.bed.iter_geno_marker(region_bim.index)
        for i, (_, raw_calls) in enumerate(markers):
            yield self._genotypes_from_bim_row(region_bim.iloc[i, :],
                                               raw_calls)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using its name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).
            For a name PyPlink reports as duplicated, one Genotypes per
            duplicate; an empty list if the name is unknown.

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        # Getting the genotypes
        try:
            raw_calls, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            duplicated = self.bed.get_duplicated_markers()
            if name not in duplicated:
                # The variant is not in the BIM file, so we return an empty
                # list
                logger.warning("Variant {} was not found".format(name))
                return []

            # The variant is a duplicated one, so we go through all the
            # variants with the same name and the :dupx suffix
            return [
                self.get_variant_by_name(dup_name).pop()
                for dup_name in duplicated[name]
            ]

        return [self._genotypes_from_bim_row(self.bim.iloc[i, :], raw_calls)]

    def get_number_samples(self):
        """Returns the number of samples.

        Returns:
            int: The number of samples.

        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.

        Returns:
            int: The number of markers.

        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the sample IDs (the index of the FAM table) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector (float copy, -1 becomes NaN)."""
        values = g.astype(float)
        values[values == -1.0] = np.nan
        return values
class ReadPlink(object):
    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """
        :param plinkstem: plink stem file path
        """
        self._plinkstem = plinkstem
        # NOTE(review): basename() drops the directory part of the stem, so
        # these three paths are relative to the working directory. They are
        # not used anywhere in this class -- confirm before relying on them.
        self._bim_path = os.path.basename(self._plinkstem)+'.bim'
        self._bed_path = os.path.basename(self._plinkstem)+'.bed'
        self._fam_path = os.path.basename(self._plinkstem)+'.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        self._sample_subjects = None
        self._sample_variants = None

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Both subjects and variants are drawn with replacement. The genotype
        vector of every sampled variant is read for all subjects before the
        sampled subjects are picked out of it.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write the sampled variants and subjects to
            'sampled_variants.csv' and 'sampled_subjects.csv'
        :returns: a numpy matrix of size n*p
        """
        self._sample_subjects = np.random.choice(self.fam.index.values, n,
                                                 replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)

        if write_disk:
            # The sampled variants are index LABELS of the bim table (marker
            # names), so they must be looked up with .loc; .iloc only
            # accepts integer positions.
            self.bim.loc[self._sample_variants].to_csv('sampled_variants.csv')
            # Sampled subjects are used as row positions (see read_bed).
            self.fam.iloc[self._sample_subjects].to_csv('sampled_subjects.csv')

        return self.read_bed(self._sample_variants, self._sample_subjects)

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        :param marker: list of SNPs (marker names); all markers if None
        :param subjects: list of subjects (row positions in the fam file);
            all subjects if None
        :returns: genotypematrix of size subjects*marker, with negative
            (missing) genotype codes replaced by 0
        """
        if marker is None:
            P_size = self.P
            marker = self.bim.index.values
        else:
            P_size = len(marker)

        if subjects is None:
            N_size = self.N
            subjects = self.fam.index.values
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)
        geno_iter = self.plinkfile.iter_geno_marker(marker)
        for j, (_, g) in enumerate(geno_iter):
            genotypematrix[:, j] = g[subjects]

        genotypematrix[genotypematrix < 0] = 0
        return genotypematrix
OUT_QUALIFIED_VARIANTS = bool(args['--ov'])

# At least one kind of cut-off (frequency or allele count) is required.
if not (len(mafs) or len(macs)):
    sys.stderr.write(
        "At least one should be open: '--alt-frqs' and/or '--alt-acs'\n")
    sys.exit(-1)

# Open and work on the PLINK binary fileset.
# https://lemieuxl.github.io/pyplink/
# API: https://lemieuxl.github.io/pyplink/pyplink.html
# API demo: https://nbviewer.jupyter.org/github/lemieuxl/pyplink/blob/master/demo/PyPlink%20Demo.ipynb
from pyplink import PyPlink

# Getting the BED BIM FAM
bed = PyPlink(plink_prefix)
bim = bed.get_bim()
fam = bed.get_fam()

# Refuse to continue when PyPlink reports duplicated marker names.
dup = bed.get_duplicated_markers()
if dup:
    sys.stderr.write('ERROR: Duplicate markers found as above!!!\n')
    sys.exit(-1)

snp_sets = set(bim.index)

# True/false array for sample we want to keep.
select_samples = [iid in samples for iid in fam['iid']]

# Header line: fixed columns followed by the IDs of the kept samples.
out = "#CHROM BEGIN END MARKER_ID NUM_ALL_VARS NUM_PASS_VARS NUM_SING_VARS MAF/MAC_CUT".split(
)
kept_ids = fam.loc[select_samples, 'iid']
sys.stdout.write('%s\t%s\n' % ('\t'.join(out), '\t'.join(kept_ids)))
class GeneReader(object):
    """Read per-gene genotype matrices from a binary PLINK fileset.

    The variant file assigns variants to genes and the phenotype file
    selects the samples; both are matched against the bim/fam tables of the
    PLINK fileset when the reader is created.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load bim/fam, the variant-to-gene table and the phenotypes.

        :param plink_path: prefix of the PLINK files (without extension)
        :param pheno_path: tab-separated phenotype file with 3 or 6 columns
        :param variant_path: tab-separated file with the four columns
            chrom, pos, rsid, gene (no header)
        """
        lg.debug("""
        Loading the following files:
        Plink: %s
        Pheno %s
        Variants: %s
        """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path

        # Only the bim/fam tables are needed here, so the handle is released
        # right away -- also when reading one of the tables fails.
        self.bfile = PyPlink(self.plink_path)
        try:
            self.bim = self.bfile.get_bim()
            self.fam = self.bfile.get_fam()
        finally:
            self.bfile.close()

        self.bim['rsid'] = self.bim.index.values
        self.n_chrom = self.bim.chrom.nunique()
        self.variants = self._get_var(self.variant_path)
        self.genes = self.variants.gene.unique()
        self.pheno = self._get_pheno(pheno_path)

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Read the variant file and keep the variants found in the bim.

        :param variant_path: tab-separated file with the four columns
            chrom, pos, rsid, gene (no header)
        :returns: inner merge of the variant table with the bim table on
            pos, chrom and rsid
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)

        # Build the lookup once instead of once per chromosome.
        bim_chromosomes = set(self.bim.chrom.unique())
        chrom_check = [k for k in chromosomes if k in bim_chromosomes]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())

        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants', nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Read the phenotype file and match it to the fam table.

        Besides returning the merged table this sets `self.n` (number of
        matched samples), `self.case_controls` (boolean array, True where
        Pheno > 0) and `self._sample_rows` (row position in the fam table
        of every matched sample).

        :param pheno_file: tab-separated file without header; either
            fid, iid, Pheno or fid, iid, father, mother, gender, Pheno
        :returns: fam table merged with the phenotypes on fid and iid
        :raises ValueError: if the file has neither 3 nor 6 columns
        :raises AssertionError: if fewer than 2 samples could be matched
        """
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())

        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender',
                           'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')

        # Both ID columns are compared as strings. The individual ID has to
        # be taken from its own column, not from the family ID.
        dat['fid'] = dat['fid'].astype(str)
        dat['iid'] = dat['iid'].astype(str)
        lg.debug(self.fam.head())

        # Remember where every matched sample sits in the fam file so that
        # genotype vectors (one entry per fam row) can be subset later.
        fam = self.fam.copy()
        fam['_fam_row'] = np.arange(fam.shape[0])
        dat = pd.merge(fam, dat, on=['fid', 'iid'])
        self._sample_rows = dat.pop('_fam_row').values

        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
        if self.n < 2:
            raise AssertionError('Sample size is smaller than 2.')

        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls', np.sum(self.case_controls),
                np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.ndarray:
        """Read the genotypes of all variants assigned to one gene.

        :param gene: gene name as given in the variant file
        :returns: matrix with one row per matched sample and one column per
            variant of the gene; missing calls (-1) are set to 0
        """
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3

        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        lg.info('Reading %s', gene)
        try:
            geno_iter = reader.iter_geno_marker(marker)
            for u, (name, g) in enumerate(geno_iter):
                # g holds one call per fam row; keep only the samples that
                # were matched to a phenotype, in the order of self.pheno.
                genotype_matrix[:, u] = g[self._sample_rows]
                lg.debug('Processed variant %s', name)
        finally:
            reader.close()

        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.array:
        """Yield the genotype matrix of one gene after the other.

        :param genes: iterable of gene names; all genes of the variant file
            if None
        :returns: generator of genotype matrices (see `_read_gene`)
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)