Example #1
0
 def _read_gene(self, gene: str) -> np.ndarray:
     """Load the genotype matrix of every variant annotated to ``gene``.

     Rows of ``self.variants`` whose ``gene`` column equals ``gene`` are
     selected and their ``rsid`` values are read, in that order, from the
     Plink fileset found at ``self.plink_path``.

     :param gene: label looked up in ``self.variants.gene``
     :return: float array of shape ``(self.n, p)`` with one column per
         selected variant; entries equal to -1 (PyPlink's code for a
         missing genotype) are replaced by 0
     :raises AssertionError: if the selected variants do not lie on exactly
         one chromosome, or if there are three or fewer of them
     """
     selected = self.variants[self.variants.gene == gene]
     chrom = selected.chrom.unique()
     assert len(chrom) == 1
     lg.debug(selected.head())
     marker = selected.rsid.values
     lg.debug(marker)
     p = len(marker)
     assert p > 3
     genotype_matrix = np.zeros((self.n, p))
     reader = PyPlink(self.plink_path)
     try:
         lg.info('Reading %s', gene)
         # enumerate() supplies the destination column, replacing the
         # hand-maintained counter.
         for column, (name, genotypes) in enumerate(
                 reader.iter_geno_marker(marker)):
             genotype_matrix[:, column] = genotypes
             lg.debug('Processed variant %s', name)
     finally:
         # Release the file handle even when reading a marker fails.
         reader.close()
     genotype_matrix[genotype_matrix == -1] = 0
     return genotype_matrix
Example #2
0
def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve a genotype matrix from a variant-major Plink fileset.

    :param rsid: list of rsids; one output column is filled per rsid, in
        the order given
    :param plink_path: plink-stem path
    :param sub_in: selection of subjects to include; expected to be a
        boolean mask over the samples of the fileset, since the number of
        output rows is taken as ``sum(sub_in)``
    :return: ``int8`` genotype matrix of shape ``(sum(sub_in), len(rsid))``
    """
    reader = PyPlink(plink_path)
    try:
        lg.debug('First item of sub_in is %s with %s',
                 sub_in[0], type(sub_in[0]))
        genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)
        # iter_geno_marker yields exactly the requested markers in request
        # order, so no membership filtering is needed and enumerate() gives
        # the destination column directly.
        for pos_index, (_, genotype) in enumerate(
                reader.iter_geno_marker(rsid)):
            genotypematrix[:, pos_index] = genotype[sub_in]
    finally:
        # Always release the file handle, including on bad input.
        reader.close()
    return genotypematrix
Example #3
0
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Open a binary Plink fileset and prepare its BIM and FAM tables.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Markers that share a (chrom, pos) pair are flagged multi-allelic.
        self.bim["multiallelic"] = False
        shares_locus = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shares_locus, "multiallelic"] = True

        # Index the FAM table by individual ID when those are unique,
        # otherwise by a composite family/individual key.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            composite_ids = []
            for fid, iid in zip(self.fam.fid, self.fam.iid):
                composite_ids.append("{fid}_{iid}".format(fid=fid, iid=iid))
            self.fam["fid_iid"] = composite_ids
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink reader."""
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Fetch the genotypes stored at the locus of a variant.

        Args:
            variant (Variant): A Variant instance; its ``chrom`` and
                ``pos`` are used to locate rows in the BIM table.

        Returns:
            list: Genotypes instances for the matching BIM rows; empty when
            no row matches the locus.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        on_chrom = self.bim.chrom == plink_chrom
        at_pos = self.bim.pos == variant.pos
        matches = self.bim.loc[on_chrom & at_pos, :]

        n_matches = matches.shape[0]
        if n_matches == 0:
            return []
        if n_matches == 1:
            return self._get_biallelic_variant(variant, matches)
        return self._get_multialleic_variant(variant, matches)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Build the Genotypes of the first row of ``info``.

        Returns an empty list when ``_check_alleles`` is true and the
        encoded (a2, a1) alleles of that row differ from
        ``variant.alleles``.
        """
        # From 1.3.2 onwards, PyPlink sets unique names, so the row label
        # identifies a single marker.
        row = info.iloc[0, :]
        encoded = variant._encode_alleles([row.a2, row.a1])
        mismatch = encoded != variant.alleles
        if _check_alleles and mismatch:
            # Variant with requested alleles is unavailable.
            return []

        raw = self.bed.get_geno_marker(row.name)
        geno = self._normalize_missing(raw)
        return [Genotypes(variant, geno, row.a2, row.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Build Genotypes for a locus matched by several BIM rows."""
        if variant.alleles is None:
            # Without requested alleles, every row at the locus is
            # returned as its own bi-allelic variant.
            return [
                Genotypes(
                    variant,
                    self._normalize_missing(self.bed.get_geno_marker(name)),
                    row.a2,
                    row.a1,
                    True,
                )
                for name, row in info.iterrows()
            ]

        # Otherwise keep only the rows whose alleles are all among the
        # requested ones.
        found = []
        for name, row in info.iterrows():
            candidate = set(Variant._encode_alleles((row.a1, row.a2)))
            if not candidate.issubset(variant.alleles_set):
                continue
            single_row = info.loc[[name], :]
            found.extend(
                self._get_biallelic_variant(variant, single_row,
                                            _check_alleles=False))
        return found

    def iter_genotypes(self):
        """Iterate over the genotypes of every marker.

        Yields:
            Genotypes instances, one per marker, pairing the i-th item of
            the BED iterator with the i-th row of the BIM table.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        for position, (_, raw) in enumerate(self.bed.iter_geno()):
            row = self.bim.iloc[position, :]
            described = Variant(row.name, CHROM_INT_TO_STR[row.chrom],
                                row.pos, [row.a1, row.a2])
            yield Genotypes(described,
                            self._normalize_missing(raw),
                            reference=row.a2,
                            coded=row.a1,
                            multiallelic=row.multiallelic)

    def iter_variants(self):
        """Iterate over the BIM rows as Variant instances."""
        for marker_name, row in self.bim.iterrows():
            # iterrows() names each row after its index label, so the
            # label is the marker name.
            yield Variant(marker_name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          [row.a1, row.a2])

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over the genotypes of variants in a closed interval.

        Selects BIM rows on ``chrom`` with ``start <= pos <= end`` and
        yields one Genotypes instance per selected row.
        """
        on_chrom = self.bim["chrom"] == CHROM_STR_TO_INT[chrom]
        after_start = start <= self.bim["pos"]
        before_end = self.bim["pos"] <= end
        region = self.bim.loc[on_chrom & after_start & before_end]

        for position, (_, raw) in enumerate(
                self.bed.iter_geno_marker(region.index)):
            row = region.iloc[position, :]
            described = Variant(row.name, CHROM_INT_TO_STR[row.chrom],
                                row.pos, [row.a1, row.a2])
            yield Genotypes(described,
                            self._normalize_missing(raw),
                            reference=row.a2,
                            coded=row.a1,
                            multiallelic=row.multiallelic)

    def get_variant_by_name(self, name):
        """Get the genotypes of a marker from its name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: One Genotypes for a known marker, one per duplicate when
            ``name`` is listed among the duplicated markers, or an empty
            list when the marker is unknown.

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)
        except ValueError:
            duplicated = self.bed.get_duplicated_markers()
            if name not in duplicated:
                # The variant is not in the BIM file at all.
                logger.warning("Variant {} was not found".format(name))
                return []

            # The name was duplicated: resolve each renamed copy in turn.
            return [
                self.get_variant_by_name(alias).pop()
                for alias in duplicated[name]
            ]

        row = self.bim.iloc[i, :]
        described = Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                            [row.a1, row.a2])
        return [
            Genotypes(
                described,
                self._normalize_missing(geno),
                reference=row.a2,
                coded=row.a1,
                multiallelic=row.multiallelic,
            )
        ]

    def get_number_samples(self):
        """Return the number of samples reported by the PyPlink reader.

        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Return the number of markers reported by the PyPlink reader.

        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the FAM index labels as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Return a float copy of ``g`` with every -1 replaced by NaN."""
        dosage = g.astype(float)
        dosage[dosage == -1.0] = np.nan
        return dosage
Example #4
0
class GeneReader(object):
    """Read per-gene genotype matrices from a Plink fileset.

    On construction the BIM and FAM tables are loaded, the variant-to-gene
    table is merged with the BIM table and the phenotype table is merged
    with the FAM table. Genotypes are read one gene at a time through
    :meth:`gene_iterator`.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load the marker, sample, variant and phenotype tables.

        :param plink_path: plink-stem path (``plink_path + '.bed'`` must
            exist)
        :param pheno_path: path of the phenotype table
        :param variant_path: path of the variant-to-gene table
        :raises AssertionError: if the bed file or the variant file is
            missing
        """
        lg.debug("""
        Loading the following files:
        Plink: %s Pheno %s Variants: %s
         """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        try:
            self.bim = self.bfile.get_bim()
            self.bim['rsid'] = self.bim.index.values
            self.fam = self.bfile.get_fam()
            self.n_chrom = self.bim.chrom.nunique()
            self.variants = self._get_var(self.variant_path)
            self.genes = self.variants.gene.unique()
            self.pheno = self._get_pheno(pheno_path)
        finally:
            # Do not leak the file handle when one of the tables is invalid.
            self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Load the variant-to-gene table and keep rows found in the BIM.

        The file is read without a header and must have exactly four
        columns, which are named ``chrom``, ``pos``, ``rsid`` and ``gene``.

        :param variant_path: path of the variant-to-gene table
        :return: inner merge of the table with ``self.bim`` on ``pos``,
            ``chrom`` and ``rsid``
        :raises AssertionError: if the table does not have four columns or
            has three or fewer rows
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        # Computed once instead of once per chromosome of the variant file.
        bim_chromosomes = set(self.bim.chrom.unique())
        chrom_check = [k for k in chromosomes if k in bim_chromosomes]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants',
                       nrow - n_var)
        return dat

    @staticmethod
    def _label_pheno_columns(dat: pd.DataFrame) -> pd.DataFrame:
        """Name the columns of a raw phenotype table and cast IDs to str.

        :param dat: header-less phenotype table with 3 or 6 columns;
            modified in place
        :return: ``dat`` with named columns and string ``fid`` / ``iid``
        :raises ValueError: if the table has neither 3 nor 6 columns
        """
        ncol = dat.shape[1]
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender', 'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')
        dat['fid'] = dat['fid'].astype(str)
        # Cast the individual IDs themselves; they used to be overwritten
        # with the family IDs, which broke the merge with the FAM table.
        dat['iid'] = dat['iid'].astype(str)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Load the phenotype table and merge it with the FAM table.

        Sets ``self.n`` (number of merged samples), ``self.sample_index``
        (FAM row position of every merged sample) and
        ``self.case_controls`` (boolean array, true where ``Pheno > 0``).

        :param pheno_file: path of a header-less phenotype table with 3
            columns (fid, iid, Pheno) or 6 columns (fid, iid, father,
            mother, gender, Pheno)
        :return: merge of ``self.fam`` with the phenotype table on ``fid``
            and ``iid``
        :raises AssertionError: if the table has fewer than 3 columns or
            fewer than 2 rows, or if fewer than 2 samples remain after
            merging
        :raises ValueError: if the table has neither 3 nor 6 columns
        """
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        dat = self._label_pheno_columns(dat)
        lg.debug(self.fam.head())
        # Carry the FAM row position through the merge so that genotype
        # vectors, which cover every FAM sample, can later be reduced to
        # the merged samples in the same row order.
        fam = self.fam.assign(_fam_row=np.arange(self.fam.shape[0]))
        dat = pd.merge(fam, dat, on=['fid', 'iid'])
        self.sample_index = dat.pop('_fam_row').values
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
            if self.n < 2:
                raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls), np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.ndarray:
        """Load the genotype matrix of every variant annotated to ``gene``.

        :param gene: label looked up in ``self.variants.gene``
        :return: float array of shape ``(self.n, p)`` whose rows follow the
            row order of ``self.pheno`` and whose columns follow the
            selected variants; entries equal to -1 (PyPlink's code for a
            missing genotype) are replaced by 0
        :raises AssertionError: if the selected variants do not lie on
            exactly one chromosome, or if there are three or fewer of them
        """
        selected = self.variants[self.variants.gene == gene]
        chrom = selected.chrom.unique()
        assert len(chrom) == 1
        lg.debug(selected.head())
        marker = selected.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        try:
            lg.info('Reading %s', gene)
            for column, (name, genotypes) in enumerate(
                    reader.iter_geno_marker(marker)):
                # Keep only the samples retained by the phenotype merge;
                # the raw vector has one entry per FAM sample.
                genotype_matrix[:, column] = genotypes[self.sample_index]
                lg.debug('Processed variant %s', name)
        finally:
            # Release the file handle even when reading a marker fails.
            reader.close()
        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.ndarray:
        """Yield the genotype matrix of each requested gene in turn.

        :param genes: iterable of gene labels; defaults to ``self.genes``
        :return: generator yielding one array per gene, as produced by
            :meth:`_read_gene`
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)