Ejemplo n.º 1
0
 def _read_gene(self, gene: str) -> np.ndarray:
     """Load the genotypes of every variant annotated to ``gene``.

     The variants are looked up in ``self.variants`` (rows whose ``gene``
     column equals ``gene``) and their ``rsid`` values are read from the
     PLINK files at ``self.plink_path``.

     :param gene: gene identifier matched against ``self.variants.gene``
     :return: float matrix with ``self.n`` rows and one column per variant,
         in the order of the ``rsid`` column; missing calls (coded ``-1``
         by the reader) are replaced with ``0``
     :raises AssertionError: if the gene's variants do not all share one
         chromosome, or if the gene has three variants or fewer
     """
     gene_variants = self.variants[self.variants.gene == gene]
     chrom = gene_variants.chrom.unique()
     assert len(chrom) == 1
     lg.debug(gene_variants.head())
     marker = gene_variants.rsid.values
     lg.debug(marker)
     p = len(marker)
     assert p > 3
     genotype_matrix = np.zeros((self.n, p))
     reader = PyPlink(self.plink_path)
     try:
         lg.info('Reading %s', gene)
         for column, (name, genotypes) in enumerate(
                 reader.iter_geno_marker(marker)):
             genotype_matrix[:, column] = genotypes
             lg.debug('Processed variant %s', name)
     finally:
         # Release the file handle even when reading a marker fails.
         reader.close()
     # Missing genotypes are imputed as 0.
     genotype_matrix[genotype_matrix == -1] = 0
     return genotype_matrix
Ejemplo n.º 2
0
def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve a genotype matrix from variant-major PLINK files.

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: selector of the subjects to include; ``sum(sub_in)`` is
        used as the number of rows, so this is presumably a boolean mask
        over all samples (confirm against callers)
    :return: int8 genotype matrix with one row per included subject and one
        column per entry of ``rsid``, in the order of ``rsid``
    """
    lg.debug('First item of sub_in is %s with %s', sub_in[0], type(sub_in[0]))
    genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)
    reader = PyPlink(plink_path)
    try:
        # iter_geno_marker yields exactly the requested markers, in the
        # requested order, so the column index follows the iteration order.
        for pos_index, (_, genotype) in enumerate(
                reader.iter_geno_marker(rsid)):
            genotypematrix[:, pos_index] = genotype[sub_in]
    finally:
        # Release the file handle even when a marker cannot be read.
        reader.close()
    return genotypematrix
Ejemplo n.º 3
0
class PlinkReader(GenotypesReader):
    """Genotypes reader for binary Plink files, built on PyPlink."""

    def __init__(self, prefix):
        """Binary plink file reader.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Identify all multi-allelics: markers sharing a chromosome and a
        # position with at least one other marker.
        self.bim["multiallelic"] = False
        self.bim.loc[self.bim.duplicated(["chrom", "pos"], keep=False),
                     "multiallelic"] = True

        # We want to set the index for the FAM file
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=fid, iid=iid)
                for fid, iid in zip(self.fam.fid, self.fam.iid)
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink reader."""
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant as
            well as a vector of encoded genotypes. The list is empty when no
            marker is found at the variant's chromosome and position.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Find the variant in the bim.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        info = self.bim.loc[(self.bim.chrom == plink_chrom) &
                            (self.bim.pos == variant.pos), :]

        if info.shape[0] == 0:
            return []

        elif info.shape[0] == 1:
            return self._get_biallelic_variant(variant, info)

        else:
            return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Get the genotypes of the marker in the first row of ``info``.

        Args:
            variant (Variant): The requested variant.
            info (pandas.DataFrame): The matching rows of the BIM file (only
                the first one is used).
            _check_alleles (bool): Whether to return an empty list when the
                alleles of the marker differ from the ones of the variant.

        Returns:
            list: A list containing a single Genotypes instance, or an empty
            list if the allele check fails.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        info = info.iloc[0, :]
        variant_alleles = variant._encode_alleles([info.a2, info.a1])
        if (_check_alleles and variant_alleles != variant.alleles):
            # Variant with requested alleles is unavailable.
            return []

        geno = self._normalize_missing(self.bed.get_geno_marker(info.name))
        return [Genotypes(variant, geno, info.a2, info.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Get the genotypes of the markers sharing the variant's position.

        Args:
            variant (Variant): The requested variant.
            info (pandas.DataFrame): The rows of the BIM file located at the
                variant's chromosome and position.

        Returns:
            list: A list of Genotypes instances (one per retained marker).

        """
        # Check if alleles are specified.
        out = []
        if variant.alleles is None:
            # If no alleles are specified, we return all the possible
            # bi-allelic variants.
            for name, row in info.iterrows():
                geno = self.bed.get_geno_marker(name)
                geno = self._normalize_missing(geno)
                out.append(Genotypes(variant, geno, row.a2, row.a1, True))

        else:
            # Find the requested alleles: keep only the markers whose two
            # alleles are both part of the variant's alleles.
            for name, row in info.iterrows():
                row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
                if row_alleles.issubset(variant.alleles_set):
                    out.extend(
                        self._get_biallelic_variant(variant,
                                                    info.loc[[name], :],
                                                    _check_alleles=False))

        return out

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Iterating over all markers; the i-th genotype vector is paired with
        # the i-th row of the BIM file.
        for i, (_, genotypes) in enumerate(self.bed.iter_geno()):
            info = self.bim.iloc[i, :]

            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(genotypes),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def iter_variants(self):
        """Iterate over marker information."""
        for _, row in self.bim.iterrows():
            yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          [row.a1, row.a2])

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over the genotypes of the variants in a region.

        Args:
            chrom (str): The chromosome (a key of CHROM_STR_TO_INT).
            start (int): The first position of the region (inclusive).
            end (int): The last position of the region (inclusive).

        Returns:
            Genotypes instances.

        """
        bim = self.bim.loc[(self.bim["chrom"] == CHROM_STR_TO_INT[chrom])
                           & (start <= self.bim["pos"]) &
                           (self.bim["pos"] <= end)]
        for i, (_, geno) in enumerate(self.bed.iter_geno_marker(bim.index)):
            info = bim.iloc[i, :]
            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(geno),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using its name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).
            The list is empty if the marker is not found.

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        # Getting the genotypes
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            duplicated = self.bed.get_duplicated_markers()
            if name in duplicated:
                # The variant is a duplicated one, so we go through all the
                # variants with the same name and the :dupx suffix
                return [
                    self.get_variant_by_name(dup_name).pop()
                    for dup_name in duplicated[name]
                ]

            else:
                # The variant is not in the BIM file, so we return an empty
                # list
                logger.warning("Variant {} was not found".format(name))
                return []

        else:
            info = self.bim.iloc[i, :]
            return [
                Genotypes(
                    Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                            [info.a1, info.a2]),
                    self._normalize_missing(geno),
                    reference=info.a2,
                    coded=info.a1,
                    multiallelic=info.multiallelic,
                )
            ]

    def get_number_samples(self):
        """Returns the number of samples.
        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.
        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Returns the sample IDs (the index of the FAM file) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector.

        The vector is converted to float and the missing genotypes (coded as
        -1) are replaced with NaN.
        """
        g = g.astype(float)
        g[g == -1.0] = np.nan
        return g
Ejemplo n.º 4
0
class ReadPlink(object):

    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """
        :param plinkstem: plink stem file path

        """
        self._plinkstem = plinkstem
        # NOTE(review): basename() drops the directory part of the stem, so
        # these three paths are relative to the working directory -- confirm
        # that this is intended.
        stem_name = os.path.basename(self._plinkstem)
        self._bim_path = stem_name+'.bim'
        self._bed_path = stem_name+'.bed'
        self._fam_path = stem_name+'.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        self._sample_subjects = None
        self._sample_variants = None

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Subjects and variants are both drawn with replacement from the
        indexes of the fam and bim tables, so the same subject or variant
        may be selected more than once. The full genotype vector of each
        selected variant is read before the sampled subjects are picked out
        of it.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write the sampled variants and subjects to
            'sampled_variants.csv' and 'sampled_subjects.csv' in the
            current working directory
        :returns: a numpy matrix of size n*p

        """
        self._sample_subjects = np.random.choice(self.fam.index.values, n, replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)

        if write_disk:
            # The sampled variants are index labels of the bim table (the
            # same values read_bed passes on as marker names), so they need
            # a label-based lookup; a positional one rejects them.
            self.bim.loc[self._sample_variants].to_csv('sampled_variants.csv')
            # The sampled subjects are also used as positions into the
            # genotype vectors in read_bed, hence the positional lookup.
            self.fam.iloc[self._sample_subjects].to_csv('sampled_subjects.csv')

        return self.read_bed(self._sample_variants, self._sample_subjects)

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        :param marker: list of SNPs (defaults to every SNP of the bim table)
        :param subjects: list of subjects, used to index each genotype
            vector (defaults to every subject of the fam table)
        :returns: int8 genotypematrix of size subjects*marker in which
            negative (missing) genotypes are set to 0

        """
        if marker is None:
            P_size = self.P
            marker = self.bim.index.values
        else:
            P_size = len(marker)

        if subjects is None:
            N_size = self.N
            subjects = self.fam.index.values
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)

        for j, (_, g) in enumerate(self.plinkfile.iter_geno_marker(marker)):
            genotypematrix[:, j] = g[subjects]

        # Missing genotypes are imputed as 0.
        genotypematrix[genotypematrix < 0] = 0

        return genotypematrix
Ejemplo n.º 5
0
                if mafs:
                    for x in mafs:
                        MAF_RESULTS[str(x)] = np.zeros(N_samples)
                        MAF_RESULTS_PASS[str(x)] = 0
                        MAF_RESULTS_SING[str(x)] = 0
                        MAF_QUALIFIED_SNPS[str(x)] = []
                if macs:
                    for x in macs:
                        MAC_RESULTS[str(x)] = np.zeros(N_samples)
                        MAC_RESULTS_PASS[str(x)] = 0
                        MAC_RESULTS_SING[str(x)] = 0
                        MAC_QUALIFIED_SNPS[str(x)] = []

                # Goes for marker one by one.
                for marker_id, genotypes in bed.iter_geno_marker(
                        record_weight.keys()):
                    # The genotypes was coded as the number of 'a1' allele.
                    # -1 as missing values.
                    # np array, int8.

                    # Subset to samples we want to use, counted the number of 'a1' allele
                    genotypes = genotypes[select_samples]
                    if bim.loc[marker_id,
                               'a1'].upper() == record_alt[marker_id]:
                        # already counted the number of alt alleles, impute missing ref.
                        genotypes[genotypes == -1] = 0
                    else:
                        # the 'a2' is the ref allele, counted the number of ref allele.
                        # impute missing as ref allele.
                        genotypes[genotypes == -1] = 2
                        # switch from count ref allele to alt allele.