Python PyPlink.close Examples

Programming Language: Python

Namespace/Package Name: pyplink

Class/Type: PyPlink

Method/Function: close

Examples at hotexamples.com: 4

Python PyPlink.close - 4 examples found. These are the top-rated real-world Python examples of pyplink.PyPlink.close, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

PyPlink(19)

get_bim(7)

get_fam(7)

iter_geno_marker(5)

close(4)

get_duplicated_markers(2)

get_geno_marker(2)

get_nb_markers(2)

get_nb_samples(2)

iter_geno(2)

write_genotypes(1)

Example #1

Show file

File: genereader.py Project: rmporsch/pyksburden

 def _read_gene(self, gene: str) -> np.array:
     """Load the genotype matrix of all variants annotated to ``gene``.

     The variants are looked up in ``self.variants`` and their genotypes
     are read from the PLINK files at ``self.plink_path``.

     Args:
         gene: Gene identifier as found in the ``gene`` column of
             ``self.variants``.

     Returns:
         A float array of shape ``(self.n, number_of_variants)``. Entries
         equal to ``-1`` (the value PyPlink uses for missing genotypes)
         are replaced by ``0``.

     Raises:
         AssertionError: If the gene's variants span more than one
             chromosome or the gene has three or fewer variants.
     """
     temp = self.variants[self.variants.gene == gene]
     chrom = temp.chrom.unique()
     assert len(chrom) == 1
     lg.debug(temp.head())
     marker = temp.rsid.values
     lg.debug(marker)
     p = len(marker)
     assert p > 3
     genotype_matrix = np.zeros((self.n, p))
     reader = PyPlink(self.plink_path)
     try:
         lg.info('Reading %s', gene)
         for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
             genotype_matrix[:, u] = g
             lg.debug('Processed variant %s', i)
     finally:
         # Release the file handle even when reading a marker fails.
         reader.close()
     genotype_matrix[genotype_matrix == -1] = 0
     return genotype_matrix

Example #2

Show file

def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve a genotype matrix from PLINK files in variant-major format.

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: list of subjects to include; it is summed to get the
        number of rows and used to index every genotype vector
        (presumably a boolean mask over all samples -- confirm with callers)
    :return: int8 genotype matrix of shape (sum(sub_in), len(rsid)), one
        column per marker in the order yielded by the reader
    """
    reader = PyPlink(plink_path)
    try:
        lg.debug('First item of sub_in is %s with %s',
                 sub_in[0], type(sub_in[0]))
        genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)
        pos_index = 0
        for snp, genotype in reader.iter_geno_marker(rsid):
            if snp not in rsid:
                continue
            genotypematrix[:, pos_index] = genotype[sub_in]
            pos_index += 1
    finally:
        # Always release the file handle, even if a marker lookup fails.
        reader.close()
    return genotypematrix

Example #3

Show file

File: plink.py Project: rochefca/geneparse

class PlinkReader(GenotypesReader):
    """Genotype reader backed by a PyPlink instance.

    The reader keeps the PyPlink object in ``self.bed`` and the tables
    returned by its ``get_bim()`` / ``get_fam()`` in ``self.bim`` and
    ``self.fam``.
    """

    def __init__(self, prefix):
        """Open the binary Plink files and prepare the BIM and FAM tables.

        The BIM table gets an extra boolean ``multiallelic`` column, and the
        FAM table is indexed by sample ID (see the body for the fallback used
        when individual IDs are not unique).

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Identify all multi-allelics: every row sharing its (chrom, pos)
        # with at least one other row is flagged.
        self.bim["multiallelic"] = False
        self.bim.loc[self.bim.duplicated(["chrom", "pos"], keep=False),
                     "multiallelic"] = True

        # We want to set the index for the FAM file. verify_integrity makes
        # set_index raise ValueError on duplicated IDs, which triggers the
        # fallback below.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=fid, iid=iid)
                for fid, iid in zip(self.fam.fid, self.fam.iid)
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink object."""
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        The variant is located in the BIM table by chromosome and position.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant as
            well as a vector of encoded genotypes. The list is empty when no
            BIM row matches the variant's chromosome and position.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Find the variant in the bim.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        info = self.bim.loc[(self.bim.chrom == plink_chrom) &
                            (self.bim.pos == variant.pos), :]

        # Dispatch on the number of BIM rows found at that locus.
        if info.shape[0] == 0:
            return []

        elif info.shape[0] == 1:
            return self._get_biallelic_variant(variant, info)

        else:
            return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Get the genotypes for the first BIM row of ``info``.

        Args:
            variant (Variant): The requested variant.
            info (pandas.DataFrame): BIM rows for the locus; only the first
                row is used.
            _check_alleles (bool): When true, an empty list is returned if
                the encoded (a2, a1) alleles of the row differ from
                ``variant.alleles``.

        Returns:
            list: Either an empty list or a single Genotypes instance
            (flagged as not multi-allelic).

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        info = info.iloc[0, :]
        variant_alleles = variant._encode_alleles([info.a2, info.a1])
        if (_check_alleles and variant_alleles != variant.alleles):
            # Variant with requested alleles is unavailable.
            return []

        geno = self._normalize_missing(self.bed.get_geno_marker(info.name))
        return [Genotypes(variant, geno, info.a2, info.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Get the genotypes for a locus with several BIM rows.

        Args:
            variant (Variant): The requested variant.
            info (pandas.DataFrame): All BIM rows found at the locus.

        Returns:
            list: Genotypes instances; all rows when ``variant.alleles`` is
            None, otherwise only the rows whose encoded alleles are a subset
            of ``variant.alleles_set``.

        """
        # Check if alleles are specified.
        out = []
        if variant.alleles is None:
            # If no alleles are specified, we return all the possible
            # bi-allelic variants.
            for name, row in info.iterrows():
                geno = self.bed.get_geno_marker(name)
                geno = self._normalize_missing(geno)
                out.append(Genotypes(variant, geno, row.a2, row.a1, True))

        else:
            # Find the requested alleles.
            for name, row in info.iterrows():
                row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
                if row_alleles.issubset(variant.alleles_set):
                    # The allele check was already done above, so it is
                    # disabled in the helper.
                    out.extend(
                        self._get_biallelic_variant(variant,
                                                    info.loc[[name], :],
                                                    _check_alleles=False))

        return out

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Iterating over all markers. The BIM row is looked up by position,
        # which assumes iter_geno() yields markers in BIM order -- TODO
        # confirm against the PyPlink documentation.
        for i, (_, genotypes) in enumerate(self.bed.iter_geno()):
            info = self.bim.iloc[i, :]

            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(genotypes),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def iter_variants(self):
        """Iterate over marker information.

        Yields:
            Variant: One instance per row of the BIM table.

        """
        for idx, row in self.bim.iterrows():
            yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          [row.a1, row.a2])

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region.

        Args:
            chrom (str): The chromosome (a key of CHROM_STR_TO_INT).
            start (int): First position of the region (inclusive).
            end (int): Last position of the region (inclusive).

        Yields:
            Genotypes: One instance per BIM row located in the region.

        """
        bim = self.bim.loc[(self.bim["chrom"] == CHROM_STR_TO_INT[chrom])
                           & (start <= self.bim["pos"]) &
                           (self.bim["pos"] <= end)]
        # Rows of the filtered BIM are paired with genotypes by position,
        # which assumes iter_geno_marker() yields markers in the order
        # requested -- TODO confirm against the PyPlink documentation.
        for i, g in enumerate(self.bed.iter_geno_marker(bim.index)):
            info = bim.iloc[i, :]
            name, geno = g
            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(geno),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using its name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes. It holds one element when the name is
            found directly, one element per duplicate when the name is a key
            of ``get_duplicated_markers()``, and is empty otherwise.

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        # Getting the genotypes
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            if name in self.bed.get_duplicated_markers():
                # The variant is a duplicated one, so we go through all the
                # variants with the same name and the :dupx suffix
                return [
                    self.get_variant_by_name(dup_name).pop()
                    for dup_name in self.bed.get_duplicated_markers()[name]
                ]

            else:
                # The variant is not in the BIM file, so we return an empty
                # list
                logger.warning("Variant {} was not found".format(name))
                return []

        else:
            # 'i' is the positional index returned by PyPlink for the marker.
            info = self.bim.iloc[i, :]
            return [
                Genotypes(
                    Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                            [info.a1, info.a2]),
                    self._normalize_missing(geno),
                    reference=info.a2,
                    coded=info.a1,
                    multiallelic=info.multiallelic,
                )
            ]

    def get_number_samples(self):
        """Returns the number of samples.

        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.

        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the index of the FAM table as a list of sample IDs."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector.

        The vector is converted to float and every -1 value is replaced by
        NaN.
        """
        g = g.astype(float)
        g[g == -1.0] = np.nan
        return g

Example #4

Show file

File: genereader.py Project: rmporsch/pyksburden

class GeneReader(object):
    """Read per-gene genotype matrices from PLINK binary files.

    The constructor loads the BIM/FAM tables, a variant-to-gene annotation
    file and a phenotype file. ``gene_iterator`` then yields one genotype
    matrix per gene.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load annotation, phenotype and PLINK metadata.

        Args:
            plink_path: PLINK file stem (``plink_path + '.bed'`` must exist).
            pheno_path: Tab-separated phenotype file with 3 or 6 columns and
                no header.
            variant_path: Tab-separated file with the four columns
                chrom, pos, rsid, gene and no header.
        """
        lg.debug("""
        Loading the following files:
        Plink: %s Pheno %s Variants: %s
         """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        try:
            self.bim = self.bfile.get_bim()
            self.bim['rsid'] = self.bim.index.values
            self.fam = self.bfile.get_fam()
            self.n_chrom = self.bim.chrom.nunique()
            self.variants = self._get_var(self.variant_path)
            self.genes = self.variants.gene.unique()
            self.pheno = self._get_pheno(pheno_path)
        finally:
            # Only metadata is needed here; release the handle even when
            # one of the input files turns out to be malformed.
            self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Read the variant annotation and inner-join it with the BIM table.

        Args:
            variant_path: Path to the 4-column variant file.

        Returns:
            The annotation rows that match a BIM row on pos, chrom and rsid.

        Raises:
            AssertionError: If the file does not have exactly 4 columns or
                has three or fewer rows.
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        # Hoisted so the BIM chromosomes are computed once, not per element.
        bim_chromosomes = self.bim.chrom.unique()
        chrom_check = [k for k in chromosomes if k in bim_chromosomes]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants',
                       nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Read the phenotype file and join it with the FAM table.

        Sets ``self.n`` (number of samples kept) and ``self.case_controls``
        (boolean array, true where ``Pheno > 0``).

        Args:
            pheno_file: Tab-separated file without header holding either
                fid, iid, Pheno or fid, iid, father, mother, gender, Pheno.

        Returns:
            The FAM table merged with the phenotypes on fid and iid.

        Raises:
            AssertionError: If the file has fewer than 3 columns or fewer
                than 2 rows, or if samples were lost in the merge and fewer
                than 2 remain.
            ValueError: If the file has neither 3 nor 6 columns.
        """
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender', 'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')
        # Cast each ID column from itself. The previous code assigned the
        # family IDs to 'iid', which dropped every sample with fid != iid
        # in the merge below.
        for id_column in ('fid', 'iid'):
            dat[id_column] = dat[id_column].astype(str)
        lg.debug(self.fam.head())
        dat = pd.merge(self.fam, dat, on=['fid', 'iid'])
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
            if self.n < 2:
                raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls), np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.array:
        """Load the genotype matrix of all variants annotated to ``gene``.

        Args:
            gene: Gene identifier as found in ``self.variants.gene``.

        Returns:
            A float array of shape ``(self.n, number_of_variants)``. Entries
            equal to ``-1`` (the value PyPlink uses for missing genotypes)
            are replaced by ``0``.

        Raises:
            AssertionError: If the gene's variants span more than one
                chromosome or the gene has three or fewer variants.
        """
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        try:
            lg.info('Reading %s', gene)
            for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
                genotype_matrix[:, u] = g
                lg.debug('Processed variant %s', i)
        finally:
            # Release the file handle even when reading a marker fails.
            reader.close()
        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.array:
        """Yield the genotype matrix of each requested gene.

        Args:
            genes: Iterable of gene names; defaults to every gene found in
                the variant file.

        Yields:
            The array returned by ``_read_gene`` for each gene, in order.
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)