def _read_gene(self, gene: str) -> np.ndarray:
    """Read the genotype matrix of every variant annotated to ``gene``.

    The variants are looked up in ``self.variants`` (rows whose ``gene``
    column equals ``gene``) and their genotypes are read from the PLINK
    file at ``self.plink_path``.

    :param gene: gene name as found in the ``gene`` column of
        ``self.variants``
    :return: float matrix of shape ``(self.n, number_of_variants)``, one
        column per variant in the order of ``self.variants``; genotype
        values of ``-1`` are replaced by ``0``
    :raises AssertionError: if the gene's variants span more than one
        chromosome (or none), or if the gene has three or fewer variants
    """
    temp = self.variants[self.variants.gene == gene]
    chrom = temp.chrom.unique()
    assert len(chrom) == 1
    lg.debug(temp.head())
    marker = temp.rsid.values
    lg.debug(marker)
    p = len(marker)
    assert p > 3
    genotype_matrix = np.zeros((self.n, p))
    reader = PyPlink(self.plink_path)
    lg.info('Reading %s', gene)
    try:
        for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
            genotype_matrix[:, u] = g
            lg.debug('Processed variant %s', i)
    finally:
        # Release the file handle even when a marker lookup or the column
        # assignment fails part-way through.
        reader.close()
    # -1 is presumably PyPlink's missing-genotype code; it is mapped to 0
    # here rather than kept as missing.
    genotype_matrix[genotype_matrix == -1] = 0
    return genotype_matrix
def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve a genotype matrix from a variant-major PLINK file.

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: list of subjects to include; it is summed to get the
        number of rows and used to index each genotype vector, so it is
        presumably a boolean mask over all samples -- confirm with callers
    :return: int8 genotype matrix of shape ``(sum(sub_in), len(rsid))``,
        one column per rsid in the order they are read
    """
    reader = PyPlink(plink_path)
    try:
        lg.debug('First item of sub_in is %s with %s',
                 sub_in[0], type(sub_in[0]))
        genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)
        # Build the lookup once; testing membership against the list for
        # every SNP would be quadratic in the number of rsids.
        wanted = set(rsid)
        pos_index = 0
        for snp, genotype in reader.iter_geno_marker(rsid):
            if snp not in wanted:
                continue
            genotypematrix[:, pos_index] = genotype[sub_in]
            pos_index += 1
    finally:
        # Close the reader even if a marker is missing or indexing fails.
        reader.close()
    return genotypematrix
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Binary plink file reader.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Variants sharing a (chrom, pos) pair are flagged as multi-allelic.
        shares_position = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim["multiallelic"] = False
        self.bim.loc[shares_position, "multiallelic"] = True

        # Index the FAM by individual ID, falling back to a composite ID
        # when the individual IDs alone are not unique.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique."
            )
            composite_ids = []
            for family, individual in zip(self.fam.fid, self.fam.iid):
                composite_ids.append(
                    "{fid}_{iid}".format(fid=family, iid=individual)
                )
            self.fam["fid_iid"] = composite_ids
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink reader."""
        self.bed.close()

    def _genotypes_from_bim_row(self, info, geno):
        """Build a Genotypes from a BIM row and its raw genotype vector."""
        described = Variant(
            info.name,
            CHROM_INT_TO_STR[info.chrom],
            info.pos,
            [info.a1, info.a2],
        )
        return Genotypes(
            described,
            self._normalize_missing(geno),
            reference=info.a2,
            coded=info.a1,
            multiallelic=info.multiallelic,
        )

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant
            as well as a vector of encoded genotypes. The list is empty when
            no BIM entry has the variant's chromosome and position.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Locate the BIM rows at the variant's chromosome and position.
        on_chrom = self.bim.chrom == CHROM_STR_TO_INT[variant.chrom]
        at_pos = self.bim.pos == variant.pos
        matches = self.bim.loc[on_chrom & at_pos, :]

        n_matches = matches.shape[0]
        if n_matches == 0:
            return []
        if n_matches == 1:
            return self._get_biallelic_variant(variant, matches)
        return self._get_multialleic_variant(variant, matches)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Return the genotypes of the first row of ``info`` as a list.

        An empty list is returned when allele checking is enabled and the
        row's encoded alleles differ from ``variant.alleles``.
        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        row = info.iloc[0, :]
        encoded = variant._encode_alleles([row.a2, row.a1])

        if _check_alleles and encoded != variant.alleles:
            # Variant with requested alleles is unavailable.
            return []

        raw = self.bed.get_geno_marker(row.name)
        return [
            Genotypes(variant, self._normalize_missing(raw),
                      row.a2, row.a1, False)
        ]

    def _get_multialleic_variant(self, variant, info):
        """Return the genotypes of the rows of ``info`` matching ``variant``.

        Without requested alleles every row is returned; otherwise only the
        rows whose alleles are a subset of ``variant.alleles_set``.
        """
        if variant.alleles is None:
            # No alleles were requested: return every bi-allelic variant
            # found at this position.
            return [
                Genotypes(
                    variant,
                    self._normalize_missing(self.bed.get_geno_marker(name)),
                    row.a2,
                    row.a1,
                    True,
                )
                for name, row in info.iterrows()
            ]

        # Keep only the rows compatible with the requested alleles.
        selected = []
        for name, row in info.iterrows():
            row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
            if not row_alleles.issubset(variant.alleles_set):
                continue
            selected.extend(
                self._get_biallelic_variant(
                    variant, info.loc[[name], :], _check_alleles=False
                )
            )
        return selected

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # The BIM row is looked up by the marker's position in the iteration.
        for position, (_, raw) in enumerate(self.bed.iter_geno()):
            yield self._genotypes_from_bim_row(
                self.bim.iloc[position, :], raw
            )

    def iter_variants(self):
        """Iterate over marker information."""
        for _, row in self.bim.iterrows():
            yield Variant(
                row.name,
                CHROM_INT_TO_STR[row.chrom],
                row.pos,
                [row.a1, row.a2],
            )

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region (both bounds inclusive)."""
        on_chrom = self.bim["chrom"] == CHROM_STR_TO_INT[chrom]
        after_start = start <= self.bim["pos"]
        before_end = self.bim["pos"] <= end
        region = self.bim.loc[on_chrom & after_start & before_end]

        markers = self.bed.iter_geno_marker(region.index)
        for position, (_, raw) in enumerate(markers):
            yield self._genotypes_from_bim_row(region.iloc[position, :], raw)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using it's name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        try:
            raw, position = self.bed.get_geno_marker(name, return_index=True)
        except ValueError:
            if name not in self.bed.get_duplicated_markers():
                # The variant is not in the BIM file.
                logger.warning("Variant {} was not found".format(name))
                return []

            # The name is a duplicated one: collect every variant stored
            # under the names registered for it.
            return [
                self.get_variant_by_name(dup_name).pop()
                for dup_name in self.bed.get_duplicated_markers()[name]
            ]

        return [self._genotypes_from_bim_row(self.bim.iloc[position, :], raw)]

    def get_number_samples(self):
        """Returns the number of samples.

        Returns:
            int: The number of samples.

        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.

        Returns:
            int: The number of markers.

        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the sample IDs (the FAM index) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector.

        Returns a float copy in which every -1 is replaced by NaN.
        """
        as_float = g.astype(float)
        as_float[as_float == -1.0] = np.nan
        return as_float
class GeneReader(object):
    """Read per-gene genotype matrices from a PLINK binary file set.

    On construction the BIM/FAM tables, the variant-to-gene table and the
    phenotype table are loaded and merged; genotypes are read lazily, one
    gene at a time, by :meth:`gene_iterator`.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load the metadata needed to iterate over genes.

        :param plink_path: plink-stem path (``plink_path + '.bed'`` must
            exist)
        :param pheno_path: path of the tab-separated phenotype file
        :param variant_path: path of the tab-separated variant file
        :raises AssertionError: if the bed or variant file does not exist
        """
        lg.debug("""
        Loading the following files:
        Plink: %s
        Pheno %s
        Variants: %s
        """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        try:
            self.bim = self.bfile.get_bim()
            # Expose the marker name (the BIM index) as a regular column so
            # it can be used as a merge key in _get_var.
            self.bim['rsid'] = self.bim.index.values
            self.fam = self.bfile.get_fam()
            self.n_chrom = self.bim.chrom.nunique()
            self.variants = self._get_var(self.variant_path)
            self.genes = self.variants.gene.unique()
            self.pheno = self._get_pheno(pheno_path)
        finally:
            # Only the metadata is needed from this handle; close it even
            # when one of the loaders above raises.
            self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Load the variant file and keep the variants present in the BIM.

        :param variant_path: path of a header-less, tab-separated file with
            exactly four columns: chrom, pos, rsid, gene
        :return: inner merge of the variant file with ``self.bim`` on
            pos, chrom and rsid
        :raises AssertionError: if the file does not have four columns or
            has three or fewer rows
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        # Computed once instead of once per chromosome being checked.
        bim_chromosomes = set(self.bim.chrom.unique())
        chrom_check = [k for k in chromosomes if k in bim_chromosomes]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants', nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Load the phenotype file and merge it with the FAM table.

        Sets ``self.n`` (number of merged samples) and
        ``self.case_controls`` (boolean array, True where ``Pheno > 0``).

        :param pheno_file: header-less, tab-separated file with either
            3 columns (fid, iid, Pheno) or 6 columns (fid, iid, father,
            mother, gender, Pheno)
        :return: merge of ``self.fam`` with the phenotype table on fid
            and iid
        :raises AssertionError: if the file has fewer than 3 columns or
            fewer than 2 rows, or if fewer than 2 samples remain after the
            merge
        :raises ValueError: if the file has neither 3 nor 6 columns
        """
        # Read both ID columns as text so IDs such as '001' keep their
        # leading zeros and still match the string IDs of the fam.
        dat = pd.read_table(pheno_file, header=None, dtype={0: str, 1: str})
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender',
                           'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')
        dat['fid'] = dat['fid'].astype(str)
        # The individual ID must come from its own column; copying the
        # family ID here would drop every sample whose FID differs from
        # its IID in the merge below.
        dat['iid'] = dat['iid'].astype(str)
        lg.debug(self.fam.head())
        dat = pd.merge(self.fam, dat, on=['fid', 'iid'])
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
        if self.n < 2:
            raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls), np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.ndarray:
        """Read the genotype matrix of every variant annotated to ``gene``.

        :param gene: gene name as found in ``self.variants.gene``
        :return: float matrix of shape ``(self.n, number_of_variants)``;
            genotype values of ``-1`` are replaced by ``0``
        :raises AssertionError: if the gene's variants span more than one
            chromosome (or none), or if the gene has three or fewer variants
        """
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        # NOTE(review): self.n is the number of samples left after merging
        # the phenotypes with the fam; the column assignment below presumably
        # fails when that is smaller than the number of samples in the PLINK
        # file -- confirm and subset the genotype vector if needed.
        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        lg.info('Reading %s', gene)
        try:
            for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
                genotype_matrix[:, u] = g
                lg.debug('Processed variant %s', i)
        finally:
            # Release the file handle even when reading fails part-way.
            reader.close()
        # -1 is presumably PyPlink's missing-genotype code; it is mapped
        # to 0 here rather than kept as missing.
        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.array:
        """Yield the genotype matrix of each gene, one gene at a time.

        This is a generator: matrices are read lazily as it is consumed.

        :param genes: iterable of gene names; defaults to ``self.genes``
            (every gene of the variant file)
        :return: generator of the matrices returned by ``_read_gene``
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)