Python PyPlink.get_fam Examples, pyplink.PyPlink.get_fam Python Examples

Example #1

0

Show file

def load_bed(geno_temp, chromo=1, indiv=None, mid_buffer=2e6):
    '''load bed-file of one chromosome and split its SNPs in effect and null SNPs

    SNPs for which 10% or more of the selected individuals have a missing
    call (coded as -1) are dropped. For the remaining SNPs the missing calls
    are replaced by the mean of the observed genotypes of that SNP.

    # Parameters:
    geno_temp (str): %-style template of the plink file prefix, formatted with `chromo`
    chromo (int): chromosome number
    indiv (None or np.ndarray):
                        if None, use all individuals in bed-file
                        if np.array, use elements as individuals
    mid_buffer (int): number of bp up- and downstream of center of chromosome to leave out

    # Returns:
    G (pd.DataFrame): DataFrame with one indiv per row, one SNP per column;
                        values are floats because imputed means are fractional
    eff_snps, null_snps (np.ndarray): array of ints with position of effect- and null-SNPs
    rsid (np.ndarray) array of str with rsid names of SNPs in G
    '''
    print(chromo)
    bed = PyPlink(geno_temp % chromo)
    try:
        n_markers = bed.get_nb_markers()
        print(n_markers)

        fam = bed.get_fam()
        iids = fam.iid.astype(int)
        if indiv is None:
            indiv = iids

        # Boolean mask over the fam rows; it is reused to subset every
        # genotype vector below.
        ind = np.isin(iids, indiv)
        indiv = iids[ind].values

        G = []
        rsids = []
        removed = 0
        for rs, geno in tqdm(bed.iter_geno(), total=n_markers):
            # Cast to float *before* imputing: writing a fractional mean into
            # an integer genotype vector would silently truncate it.
            gen = geno[ind].astype(float)
            missing = gen == -1
            if missing.mean() < 0.1:
                gen[missing] = gen[~missing].mean()
                G.append(gen)
                rsids.append(rs)
            else:
                removed += 1
        print(f'removed {removed} SNPs due to missing>10%')

        bim = bed.get_bim().loc[rsids]
    finally:
        # Release the file handle even if reading fails part-way.
        bed.close()

    G = pd.DataFrame(
        np.array(G).T,
        index=indiv,
        columns=['c%d:%d' % (chromo, x) for x in range(len(rsids))],
    )

    # Effect SNPs lie left of the (buffered) chromosome midpoint, null SNPs
    # right of it; SNPs inside the buffer belong to neither set.
    mid = bim.pos.min() + (bim.pos.max() - bim.pos.min()) // 2
    eff_snps = np.where(bim.pos < mid - mid_buffer)[0]
    null_snps = np.where(bim.pos > mid + mid_buffer)[0]

    return G, eff_snps, null_snps, np.array(rsids)

Example #2

0

Show file

# In[17]:

# Stay None unless a binary plink fileset was given on the command line.
plink = plink_bim = plink_fam = None

if args.bfile is not None:
    plink = PyPlink(args.bfile)
    plink_bim = plink.get_bim()

    # IDs are forced to strings, then the FAM columns are renamed.
    plink_fam = (
        plink.get_fam()
        .astype({'fid': str, 'iid': str})
        .rename(columns={
            'fid': 'FID',
            'iid': 'IID',
            'father': 'fID',
            'mother': 'mID',
            'gender': 'sex',
        })
    )

    # The sample summary counts rows whose 'sex' is 1 and 2 respectively.
    log.info("{} samples ({} males, {} females) loaded from {}".format(
        len(plink_fam),
        plink_fam['sex'].eq(1).sum(),
        plink_fam['sex'].eq(2).sum(),
        args.bfile,
    ))
    log.info("{} unphased variants loaded from {}".format(
        len(plink_bim), args.bfile))

# In[18]:

phased_FID_list = None

Example #3

0

Show file

import matplotlib.pyplot as plt

from pyplink import PyPlink

from basic_tools import *

# # load plink, aa and check integrity

# In[2]:

# Open the plink fileset, then pull its FAM and BIM tables.
plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)

# FAM table: IDs as strings, ID columns renamed to upper case.
plink_KCHIP_HLA_AA_SNP_1000G_fam = (
    plink_KCHIP_HLA_AA_SNP_1000G.get_fam()
    .astype({'fid': str, 'iid': str})
    .rename(columns={'fid': 'FID', 'iid': 'IID'})
)
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

# In[3]:

grm_path = 'data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'

# In[4]:

#final_plink_aa_grm_path

# # load phenotype and check integrity

Example #4

0

Show file

File: plink.py Project: rochefca/geneparse

class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Open a binary plink fileset and keep its BIM and FAM tables.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        self.bed = PyPlink(prefix)
        self.bim = self.bed.get_bim()
        self.fam = self.bed.get_fam()

        # Markers that share a (chrom, pos) pair are flagged multi-allelic.
        self.bim["multiallelic"] = False
        shares_locus = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shares_locus, "multiallelic"] = True

        # Index the FAM table by individual ID, or by "fid_iid" when the
        # individual IDs alone are not unique.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            sample_pairs = zip(self.fam.fid, self.fam.iid)
            self.fam["fid_iid"] = [
                "{fid}_{iid}".format(fid=fid, iid=iid)
                for fid, iid in sample_pairs
            ]
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying PyPlink object."""
        self.bed.close()

    def _genotypes_from_bim_row(self, info, geno):
        """Build a Genotypes instance from one BIM row and its raw vector."""
        variant = Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                          [info.a1, info.a2])
        return Genotypes(variant,
                         self._normalize_missing(geno),
                         reference=info.a2,
                         coded=info.a1,
                         multiallelic=info.multiallelic)

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant as
            well as a vector of encoded genotypes. The list is empty when no
            BIM row matches the variant's chromosome and position.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # Locate the BIM rows at the variant's chromosome and position.
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        at_locus = ((self.bim.chrom == plink_chrom) &
                    (self.bim.pos == variant.pos))
        info = self.bim.loc[at_locus, :]

        n_hits = info.shape[0]
        if n_hits == 0:
            return []
        if n_hits == 1:
            return self._get_biallelic_variant(variant, info)
        return self._get_multialleic_variant(variant, info)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Return the genotypes of the first row of `info` as a list.

        The list is empty when `_check_alleles` is set and the row's alleles
        do not match the alleles of `variant`.
        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        row = info.iloc[0, :]
        encoded = variant._encode_alleles([row.a2, row.a1])
        if _check_alleles and encoded != variant.alleles:
            # Variant with requested alleles is unavailable.
            return []

        raw = self.bed.get_geno_marker(row.name)
        geno = self._normalize_missing(raw)
        return [Genotypes(variant, geno, row.a2, row.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Return genotypes for every row of `info` compatible with `variant`."""
        if variant.alleles is None:
            # No alleles requested: return every bi-allelic row at the locus.
            return [
                Genotypes(
                    variant,
                    self._normalize_missing(self.bed.get_geno_marker(name)),
                    row.a2,
                    row.a1,
                    True,
                )
                for name, row in info.iterrows()
            ]

        # Alleles requested: keep only the rows whose alleles are a subset.
        out = []
        for name, row in info.iterrows():
            row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
            if not row_alleles.issubset(variant.alleles_set):
                continue
            out.extend(
                self._get_biallelic_variant(variant,
                                            info.loc[[name], :],
                                            _check_alleles=False))
        return out

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # The i-th genotype vector is paired with the i-th BIM row.
        for i, (_, genotypes) in enumerate(self.bed.iter_geno()):
            yield self._genotypes_from_bim_row(self.bim.iloc[i, :], genotypes)

    def iter_variants(self):
        """Iterate over marker information."""
        for _, row in self.bim.iterrows():
            yield Variant(row.name, CHROM_INT_TO_STR[row.chrom], row.pos,
                          [row.a1, row.a2])

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region (both bounds inclusive)."""
        on_chrom = self.bim["chrom"] == CHROM_STR_TO_INT[chrom]
        in_range = (start <= self.bim["pos"]) & (self.bim["pos"] <= end)
        bim = self.bim.loc[on_chrom & in_range]

        markers = self.bed.iter_geno_marker(bim.index)
        for i, (_, geno) in enumerate(markers):
            yield self._genotypes_from_bim_row(bim.iloc[i, :], geno)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using its name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            if name in self.bed.get_duplicated_markers():
                # Duplicated marker: collect every variant stored under the
                # same name with the :dupx suffix.
                return [
                    self.get_variant_by_name(dup_name).pop()
                    for dup_name in self.bed.get_duplicated_markers()[name]
                ]

            # Not in the BIM file at all.
            logger.warning("Variant {} was not found".format(name))
            return []

        return [self._genotypes_from_bim_row(self.bim.iloc[i, :], geno)]

    def get_number_samples(self):
        """Returns the number of samples.
        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.
        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Return the FAM index (the sample identifiers) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Return a float copy of `g` with -1 replaced by NaN."""
        normalized = g.astype(float)
        normalized[normalized == -1.0] = np.nan
        return normalized

Example #5

0

Show file

File: genotypes.py Project: rmporsch/simtools

class ReadPlink(object):

    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """Open the plink fileset and cache its FAM and BIM tables.

        :param plinkstem: plink stem file path

        """
        self._plinkstem = plinkstem
        # NOTE(review): basename() drops any directory part of the stem, so
        # these names are relative to the current directory -- confirm that
        # this is intended.
        self._bim_path = os.path.basename(self._plinkstem)+'.bim'
        self._bed_path = os.path.basename(self._plinkstem)+'.bed'
        self._fam_path = os.path.basename(self._plinkstem)+'.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        self._sample_subjects = None
        self._sample_variants = None

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Subjects are drawn with replacement from the FAM index and variants
        from the BIM index. Each sampled marker is read as a full genotype
        vector and then subset to the sampled subjects.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write the sampled variants and subjects to
            'sampled_variants.csv' and 'sampled_subjects.csv'
        :returns: a numpy matrix of size n*p

        """
        self._sample_subjects = np.random.choice(self.fam.index.values, n, replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)

        if write_disk:
            # Sampled variants are BIM index *labels* (the same values are
            # handed to iter_geno_marker as marker names), so they must be
            # selected with .loc; .iloc would treat them as row positions.
            self.bim.loc[self._sample_variants].to_csv('sampled_variants.csv')
            # Subjects are used as row positions in read_bed, hence .iloc.
            self.fam.iloc[self._sample_subjects].to_csv('sampled_subjects.csv')

        genotypematrix = self.read_bed(self._sample_variants,
                                        self._sample_subjects)

        return genotypematrix

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        Negative genotype codes are replaced by 0 in the returned matrix.

        :param marker: list of SNPs (all BIM index values if None)
        :param subjects: list of subjects, used to index each genotype
            vector (all FAM index values if None)
        :returns: genotypematrix of size subjects*marker

        """
        if marker is None:
            P_size = self.P
            marker = self.bim.index.values
        else:
            P_size = len(marker)

        if subjects is None:
            N_size = self.N
            subjects = self.fam.index.values
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)

        # One column per requested marker, in the order they are yielded.
        for j, (_, g) in enumerate(self.plinkfile.iter_geno_marker(marker)):
            genotypematrix[:, j] = g[subjects]

        genotypematrix[genotypematrix < 0] = 0

        return genotypematrix

Example #6

0

Show file

    # Flag taken from the '--ov' command-line option.
    OUT_QUALIFIED_VARIANTS = True if args['--ov'] else False

    # At least one of the frequency / count cut-off lists must be non-empty.
    if len(mafs) == 0 and len(macs) == 0:
        sys.stderr.write(
            "At least one should be open: '--alt-frqs' and/or '--alt-acs'\n")
        sys.exit(-1)

    # Open and work on the PLINK binary fileset (BED/BIM/FAM) through pyplink.
    # https://lemieuxl.github.io/pyplink/
    # API: https://lemieuxl.github.io/pyplink/pyplink.html
    # API demo: https://nbviewer.jupyter.org/github/lemieuxl/pyplink/blob/master/demo/PyPlink%20Demo.ipynb
    from pyplink import PyPlink
    # Getting the BED BIM FAM
    bed = PyPlink(plink_prefix)
    bim = bed.get_bim()
    fam = bed.get_fam()
    # Abort when the fileset reports any duplicated marker.
    # NOTE(review): the message says "as above" -- presumably the duplicates
    # were already reported while the fileset was opened; confirm.
    dup = bed.get_duplicated_markers()
    if dup:
        sys.stderr.write('ERROR: Duplicate markers found as above!!!\n')
        sys.exit(-1)

    # Set of the BIM index values (presumably the marker names -- confirm).
    snp_sets = set(bim.index)
    select_samples = [x in samples for x in fam.loc[:, 'iid']
                      ]  # Boolean mask, one entry per FAM row, for the samples we want to keep.

    # print(vcf.samples)
    # Header line: the fixed column names followed by the kept sample IDs,
    # all tab-separated.
    out = "#CHROM  BEGIN   END     MARKER_ID       NUM_ALL_VARS    NUM_PASS_VARS   NUM_SING_VARS MAF/MAC_CUT".split(
    )
    sys.stdout.write(
        '%s\t%s\n' %
        ('\t'.join(out), '\t'.join(fam.loc[select_samples]['iid'])))

Example #7

0

Show file

File: genereader.py Project: rmporsch/pyksburden

class GeneReader(object):
    """Read per-gene genotype matrices from a plink fileset.

    On construction the BIM and FAM tables are loaded, the variant-to-gene
    file is merged with the BIM table and the phenotype file is merged with
    the FAM table.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load the plink tables, the variant file and the phenotype file.

        :param plink_path: plink file prefix ('<prefix>.bed' must exist)
        :param pheno_path: path of the phenotype file (3 or 6 columns)
        :param variant_path: path of the 4-column variant file
        """
        lg.debug("""
        Loading the following files:
        Plink: %s Pheno %s Variants: %s
         """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        self.bim = self.bfile.get_bim()
        # Expose the BIM index as a regular column so it can be merged on.
        self.bim['rsid'] = self.bim.index.values
        self.fam = self.bfile.get_fam()
        self.n_chrom = self.bim.chrom.nunique()
        self.variants = self._get_var(self.variant_path)
        self.genes = self.variants.gene.unique()
        self.pheno = self._get_pheno(pheno_path)
        # Genotypes are read later through a fresh reader (see _read_gene).
        self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Read the variant file and keep the variants found in the BIM table.

        The file must have no header, exactly 4 columns
        (chrom, pos, rsid, gene) and more than 3 rows.

        :param variant_path: path of the variant file
        :returns: inner merge of the variant file with the BIM table on
            pos, chrom and rsid
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        chrom_check = [k for k in chromosomes if k in self.bim.chrom.unique()]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants',
                       nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Read the phenotype file and merge it with the FAM table.

        The file must have no header and either 3 columns (fid, iid, Pheno)
        or 6 columns (fid, iid, father, mother, gender, Pheno). Sets
        `self.n` (number of merged samples) and `self.case_controls`
        (boolean array, True where Pheno > 0).

        :param pheno_file: path of the phenotype file
        :returns: merge of the FAM table with the phenotype file on fid, iid
        :raises ValueError: if the file has neither 3 nor 6 columns
        :raises AssertionError: if fewer than 2 samples remain after merging
        """
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender', 'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')
        # Both ID columns become strings for the merge. The individual ID must
        # be derived from the 'iid' column itself: copying 'fid' into it would
        # discard every sample whose family and individual IDs differ.
        dat['fid'] = dat['fid'].astype(str)
        dat['iid'] = dat['iid'].astype(str)
        lg.debug(self.fam.head())
        dat = pd.merge(self.fam, dat, on=['fid', 'iid'])
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
            if self.n < 2:
                raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls), np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.array:
        """Read the genotypes of all variants assigned to one gene.

        The gene must lie on a single chromosome and have more than 3
        variants. Genotypes coded as -1 are replaced by 0.

        :param gene: gene name as found in the variant file
        :returns: array of shape (self.n, number of variants of the gene)
        """
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        genotype_matrix = np.zeros((self.n, p))
        reader = PyPlink(self.plink_path)
        try:
            lg.info('Reading %s', gene)
            # NOTE(review): each genotype vector is written into a column of
            # self.n rows; this presumably requires that every sample of the
            # plink file survived the phenotype merge -- confirm.
            for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
                genotype_matrix[:, u] = g
                lg.debug('Processed variant %s', i)
        finally:
            # Always release the reader, also when reading fails.
            reader.close()
        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.array:
        """Yield the genotype matrix of each gene in turn.

        :param genes: iterable of gene names; all genes of the variant file
            if None
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)