def _read_gene(self, gene: str) -> np.ndarray:
    """Read the genotypes of all variants annotated to ``gene``.

    The variants are looked up in ``self.variants`` (rows whose ``gene``
    column equals ``gene``) and their ``rsid`` values are read from the
    PLINK files at ``self.plink_path``.

    :param gene: gene identifier as it appears in ``self.variants.gene``
    :return: float matrix of shape ``(self.n, number_of_variants)``; one
        column per variant, in the order of ``self.variants``. Entries
        equal to -1 (the missing-genotype code) are replaced by 0.
    :raises AssertionError: if the gene's variants are not all on exactly
        one chromosome, or if the gene has three or fewer variants.
    """
    temp = self.variants[self.variants.gene == gene]
    chrom = temp.chrom.unique()
    assert len(chrom) == 1
    lg.debug(temp.head())

    marker = temp.rsid.values
    lg.debug(marker)
    p = len(marker)
    assert p > 3

    genotype_matrix = np.zeros((self.n, p))
    reader = PyPlink(self.plink_path)
    try:
        lg.info('Reading %s', gene)
        for column, (name, genotypes) in enumerate(
                reader.iter_geno_marker(marker)):
            genotype_matrix[:, column] = genotypes
            lg.debug('Processed variant %s', name)
    finally:
        # Release the file handle even when a marker cannot be read.
        reader.close()

    # Missing calls are treated as zero copies of the counted allele.
    genotype_matrix[genotype_matrix == -1] = 0
    return genotype_matrix
def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve a genotype matrix from variant-major PLINK files.

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: subjects to include. It is used both to index each
        genotype vector (``genotype[sub_in]``) and to size the output
        (``sum(sub_in)`` rows), so a boolean mask over all samples is
        expected.
    :return: int8 genotype matrix of shape ``(sum(sub_in), len(rsid))``.
        Values are stored exactly as returned by the reader; missing
        calls are NOT recoded here.
    """
    reader = PyPlink(plink_path)
    try:
        # Guarded so that an empty selection does not fail on logging.
        if len(sub_in):
            lg.debug('First item of sub_in is %s with %s',
                     sub_in[0], type(sub_in[0]))

        # Built once: membership against the raw list is linear per marker.
        wanted = set(rsid)
        genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)

        pos_index = 0
        for snp, genotype in reader.iter_geno_marker(rsid):
            if snp not in wanted:
                continue
            genotypematrix[:, pos_index] = genotype[sub_in]
            pos_index += 1
    finally:
        # Release the file handle even when reading fails midway.
        reader.close()

    return genotypematrix
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Binary plink file reader.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Markers sharing a (chrom, pos) pair are flagged as multi-allelic.
        self.bim["multiallelic"] = False
        shared_position = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shared_position, "multiallelic"] = True

        # Index the FAM by individual ID; fall back to a composite
        # 'fid_iid' key when individual IDs are not unique.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique."
            )
            composite_ids = []
            for family_id, individual_id in zip(self.fam.fid, self.fam.iid):
                composite_ids.append(
                    "{fid}_{iid}".format(fid=family_id, iid=individual_id)
                )
            self.fam["fid_iid"] = composite_ids
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink object."""
        self.bed.close()

    def _make_genotypes(self, info, geno):
        """Build a Genotypes instance from a BIM row and a raw vector.

        Args:
            info (pandas.Series): a row of the BIM (its name is the marker
                name).
            geno: the genotype vector as returned by PyPlink.

        Returns:
            Genotypes: the genotypes with missing values normalized.

        """
        variant = Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                          info.pos, [info.a1, info.a2])
        return Genotypes(
            variant,
            self._normalize_missing(geno),
            reference=info.a2,
            coded=info.a1,
            multiallelic=info.multiallelic,
        )

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant
            as well as a vector of encoded genotypes. The list is empty when
            no marker of the BIM has the variant's chromosome and position.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Locate the variant in the BIM by chromosome and position.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        same_chrom = self.bim.chrom == plink_chrom
        same_pos = self.bim.pos == variant.pos
        info = self.bim.loc[same_chrom & same_pos, :]

        n_matches = info.shape[0]
        if n_matches == 0:
            return []
        if n_matches == 1:
            return self._get_biallelic_variant(variant, info)
        return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Return the genotypes of the first row of ``info``.

        When ``_check_alleles`` is true and the alleles of the BIM row
        (once encoded) differ from ``variant.alleles``, an empty list is
        returned.
        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        row = info.iloc[0, :]

        encoded = variant._encode_alleles([row.a2, row.a1])
        if _check_alleles and encoded != variant.alleles:
            # Variant with requested alleles is unavailable.
            return []

        raw = self.bed.get_geno_marker(row.name)
        geno = self._normalize_missing(raw)
        return [Genotypes(variant, geno, row.a2, row.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Return the genotypes for a position shared by several markers."""
        if variant.alleles is None:
            # No alleles were specified: every bi-allelic marker at the
            # position is returned.
            return [
                Genotypes(
                    variant,
                    self._normalize_missing(self.bed.get_geno_marker(name)),
                    row.a2,
                    row.a1,
                    True,
                )
                for name, row in info.iterrows()
            ]

        # Alleles were specified: keep the markers whose alleles are all
        # among the requested ones.
        results = []
        for name, row in info.iterrows():
            row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
            if not row_alleles.issubset(variant.alleles_set):
                continue
            single_row = info.loc[[name], :]
            results.extend(
                self._get_biallelic_variant(variant, single_row,
                                            _check_alleles=False)
            )
        return results

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # The i-th vector yielded by the BED is paired with the i-th BIM row.
        for position, (_, genotypes) in enumerate(self.bed.iter_geno()):
            yield self._make_genotypes(self.bim.iloc[position, :], genotypes)

    def iter_variants(self):
        """Iterate over marker information."""
        for _, row in self.bim.iterrows():
            alleles = [row.a1, row.a2]
            yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          alleles)

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region (both bounds inclusive)."""
        on_chrom = self.bim["chrom"] == CHROM_STR_TO_INT[chrom]
        after_start = start <= self.bim["pos"]
        before_end = self.bim["pos"] <= end
        bim = self.bim.loc[on_chrom & after_start & before_end]

        markers = self.bed.iter_geno_marker(bim.index)
        for position, (name, geno) in enumerate(markers):
            yield self._make_genotypes(bim.iloc[position, :], geno)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using it's name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        try:
            geno, index = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            duplicated = self.bed.get_duplicated_markers()
            if name not in duplicated:
                # The variant is not in the BIM file, so we return an empty
                # list
                logger.warning("Variant {} was not found".format(name))
                return []

            # The variant is a duplicated one, so we go through all the
            # variants with the same name and the :dupx suffix
            return [
                self.get_variant_by_name(dup_name).pop()
                for dup_name in duplicated[name]
            ]

        return [self._make_genotypes(self.bim.iloc[index, :], geno)]

    def get_number_samples(self):
        """Returns the number of samples.

        Returns:
            int: The number of samples.

        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.

        Returns:
            int: The number of markers.

        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the sample identifiers (the index of the FAM) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector.

        The vector is converted to float and values equal to -1 are
        replaced by NaN.
        """
        normalized = g.astype(float)
        missing = normalized == -1.0
        normalized[missing] = np.nan
        return normalized
class ReadPlink(object):
    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """
        :param plinkstem: plink stem file path
        """
        self._plinkstem = plinkstem
        # Only the base name of the stem is kept for these three names.
        stem_name = os.path.basename(self._plinkstem)
        self._bim_path = stem_name + '.bim'
        self._bed_path = stem_name + '.bed'
        self._fam_path = stem_name + '.fam'
        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        self._sample_subjects = None
        self._sample_variants = None

    def close(self):
        """Close the underlying PyPlink reader."""
        self.plinkfile.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Subjects and variants are both drawn with replacement from the
        index of the fam and bim tables. Each sampled variant is read for
        all subjects and only then subset to the sampled subjects.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write the sampled variants and subjects
            to 'sampled_variants.csv' and 'sampled_subjects.csv' in the
            current working directory
        :returns: a numpy matrix of size n*p

        """
        self._sample_subjects = np.random.choice(self.fam.index.values,
                                                 n, replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)
        if write_disk:
            # The sampled values are index LABELS (marker names for the
            # bim), so they must be looked up with .loc; .iloc only accepts
            # integer positions and fails on marker names.
            self.bim.loc[self._sample_variants].to_csv(
                'sampled_variants.csv')
            self.fam.loc[self._sample_subjects].to_csv(
                'sampled_subjects.csv')
        return self.read_bed(self._sample_variants, self._sample_subjects)

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        :param marker: list of SNPs; defaults to every marker of the bim
        :param subjects: list of subjects, used to index each genotype
            vector (``g[subjects]``); defaults to the index of the fam.
            NOTE(review): this assumes the fam index holds sample
            positions -- confirm if the fam is ever re-indexed.
        :returns: int8 genotypematrix of size subjects*marker, with
            negative (missing) genotypes replaced by 0

        """
        if marker is None:
            marker = self.bim.index.values
            P_size = self.P
        else:
            P_size = len(marker)

        if subjects is None:
            subjects = self.fam.index.values
            N_size = self.N
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)
        for j, (_, g) in enumerate(self.plinkfile.iter_geno_marker(marker)):
            genotypematrix[:, j] = g[subjects]

        # Missing calls are coded negative; treat them as zero copies.
        genotypematrix[genotypematrix < 0] = 0
        return genotypematrix
if mafs: for x in mafs: MAF_RESULTS[str(x)] = np.zeros(N_samples) MAF_RESULTS_PASS[str(x)] = 0 MAF_RESULTS_SING[str(x)] = 0 MAF_QUALIFIED_SNPS[str(x)] = [] if macs: for x in macs: MAC_RESULTS[str(x)] = np.zeros(N_samples) MAC_RESULTS_PASS[str(x)] = 0 MAC_RESULTS_SING[str(x)] = 0 MAC_QUALIFIED_SNPS[str(x)] = [] # Goes for marker one by one. for marker_id, genotypes in bed.iter_geno_marker( record_weight.keys()): # The genotypes was coded as the number of 'a1' allele. # -1 as missing values. # np array, int8. # Subset to samples we want to use, counted the number of 'a1' allele genotypes = genotypes[select_samples] if bim.loc[marker_id, 'a1'].upper() == record_alt[marker_id]: # already counted the number of alt alleles, impute missing ref. genotypes[genotypes == -1] = 0 else: # the 'a2' is the ref allele, counted the number of ref allele. # impute missing as ref allele. genotypes[genotypes == -1] = 2 # switch from count ref allele to alt allele.