Ejemplo n.º 1
0
def load_bed(geno_temp, chromo=1, indiv=None, mid_buffer=2e6):
    '''load bed-file and split in effect and null SNPs
    
    # Parameters:
    chromo (int): chromosome number
    indiv (None or np.ndarray):
                        if None, use all individuals in bed-file
                        if np.array, use elements as indivduals
    mid_buffer (int): number of bp up- and downstream of center of chromosome to leave out

    # Returns:
    G (pd.DataFrame): DataFrame with one indiv per row, one SNP per column
    eff_snps, null_snps (np.ndarray): array of ints with position of effect- and null-SNPs
    rsid (np.ndarray) array of str with rsid names of SNPs in G
    '''
    print(chromo)
    bed = PyPlink(geno_temp % chromo)
    print(bed.get_nb_markers())
    fam = bed.get_fam()
    if indiv is None:
        indiv = fam.iid.astype(int)

    ind = np.isin(fam.iid.astype(int), indiv)
    indiv = fam.loc[ind, 'iid'].astype(int).values

    G = []
    rsids = []
    removed = 0
    for g in tqdm(bed.iter_geno(), total=bed.get_nb_markers()):
        rs = g[0]
        gen = g[1][ind]
        g_ind = gen == -1
        if g_ind.mean() < 0.1:
            gen[g_ind] = gen[~g_ind].mean()
            G.append(gen)
            rsids.append(rs)
        else:
            removed += 1
    print(f'removed {removed} SNPs due to missing>10%')

    G = pd.DataFrame(
        np.array(G).T,
        index=indiv,
        columns=['c%d:%d' % (chromo, x) for x in range(len(rsids))],
    )

    bim = bed.get_bim().loc[rsids]
    mid = bim.pos.min() + (bim.pos.max() - bim.pos.min()) // 2
    eff_snps = np.where(bim.pos < mid - mid_buffer)[0]
    null_snps = np.where(bim.pos > mid + mid_buffer)[0]

    return G, eff_snps, null_snps, np.array(rsids)
Ejemplo n.º 2
0
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Binary plink file reader.
        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Identify all multi-allelics.
        self.bim["multiallelic"] = False
        self.bim.loc[self.bim.duplicated(["chrom", "pos"], keep=False),
                     "multiallelic"] = True

        # We want to set the index for the FAM file
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=fid, iid=iid)
                for fid, iid in zip(self.fam.fid, self.fam.iid)
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            marker (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant as
            well as a vector of encoded genotypes.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Find the variant in the bim.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        info = self.bim.loc[(self.bim.chrom == plink_chrom) &
                            (self.bim.pos == variant.pos), :]

        if info.shape[0] == 0:
            return []

        elif info.shape[0] == 1:
            return self._get_biallelic_variant(variant, info)

        else:
            return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        # From 1.3.2 onwards, PyPlink sets unique names.
        info = info.iloc[0, :]
        variant_alleles = variant._encode_alleles([info.a2, info.a1])
        if (_check_alleles and variant_alleles != variant.alleles):
            # Variant with requested alleles is unavailable.
            return []

        geno = self._normalize_missing(self.bed.get_geno_marker(info.name))
        return [Genotypes(variant, geno, info.a2, info.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        # Check if alleles are specified.
        out = []
        if variant.alleles is None:
            # If no alleles are specified, we return all the possible
            # bi-allelic variats.
            for name, row in info.iterrows():
                geno = self.bed.get_geno_marker(name)
                geno = self._normalize_missing(geno)
                out.append(Genotypes(variant, geno, row.a2, row.a1, True))

        else:
            # Find the requested alleles.
            for name, row in info.iterrows():
                row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
                if row_alleles.issubset(variant.alleles_set):
                    out.extend(
                        self._get_biallelic_variant(variant,
                                                    info.loc[[name], :],
                                                    _check_alleles=False))

        return out

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Iterating over all markers
        for i, (_, genotypes) in enumerate(self.bed.iter_geno()):
            info = self.bim.iloc[i, :]

            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(genotypes),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def iter_variants(self):
        """Iterate over marker information."""
        for idx, row in self.bim.iterrows():
            yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          [row.a1, row.a2])

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region."""
        bim = self.bim.loc[(self.bim["chrom"] == CHROM_STR_TO_INT[chrom])
                           & (start <= self.bim["pos"]) &
                           (self.bim["pos"] <= end)]
        for i, g in enumerate(self.bed.iter_geno_marker(bim.index)):
            info = bim.iloc[i, :]
            name, geno = g
            yield Genotypes(Variant(info.name, CHROM_INT_TO_STR[info.chrom],
                                    info.pos, [info.a1, info.a2]),
                            self._normalize_missing(geno),
                            reference=info.a2,
                            coded=info.a1,
                            multiallelic=info.multiallelic)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using it's name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        # Getting the genotypes
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            if name in self.bed.get_duplicated_markers():
                # The variant is a duplicated one, so we go through all the
                # variants with the same name and the :dupx suffix
                return [
                    self.get_variant_by_name(dup_name).pop()
                    for dup_name in self.bed.get_duplicated_markers()[name]
                ]

            else:
                # The variant is not in the BIM file, so we return an empty
                # list
                logger.warning("Variant {} was not found".format(name))
                return []

        else:
            info = self.bim.iloc[i, :]
            return [
                Genotypes(
                    Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                            [info.a1, info.a2]),
                    self._normalize_missing(geno),
                    reference=info.a2,
                    coded=info.a1,
                    multiallelic=info.multiallelic,
                )
            ]

    def get_number_samples(self):
        """Returns the number of samples.
        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.
        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Normalize a plink genotype vector."""
        g = g.astype(float)
        g[g == -1.0] = np.nan
        return g