Exemple #1
0
    def __init__(self, prefix):
        """Open a binary Plink dataset and cache its BIM and FAM tables.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        reader = PyPlink(prefix)
        self.bed = reader
        self.bim = reader.get_bim()
        self.fam = reader.get_fam()

        # A variant is flagged as multi-allelic when at least one other
        # variant of the BIM shares its chromosome and position.
        self.bim["multiallelic"] = False
        shared_locus = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shared_locus, "multiallelic"] = True

        # The samples are indexed by individual ID when those are unique,
        # otherwise by the "<fid>_<iid>" combination.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            composite_ids = []
            for family_id, individual_id in zip(self.fam.fid, self.fam.iid):
                composite_ids.append(
                    "{fid}_{iid}".format(fid=family_id, iid=individual_id)
                )
            self.fam["fid_iid"] = composite_ids
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)
Exemple #2
0
    def __init__(self, plinkstem):
        """Open the Plink dataset at ``plinkstem`` and record its dimensions.

        :param plinkstem: plink stem file path

        """
        self._plinkstem = plinkstem

        # The companion file names are built from the last path component of
        # the stem only, so any directory part is dropped.
        # NOTE(review): confirm that dropping the directory is intended.
        stem_name = os.path.basename(self._plinkstem)
        self._bim_path = stem_name + '.bim'
        self._bed_path = stem_name + '.bed'
        self._fam_path = stem_name + '.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()

        # N is the number of FAM rows, P the number of BIM rows.
        self.N, self.P = self.fam.shape[0], self.bim.shape[0]
        self._sample_subjects = self._sample_variants = None
Exemple #3
0
 def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
     """Load the BIM/FAM tables, the variant-to-gene table and the phenotypes.

     Args:
         plink_path: prefix of the binary Plink files (``.bed`` must exist).
         pheno_path: path of the phenotype file, passed to ``_get_pheno``.
         variant_path: path of the variant file, passed to ``_get_var``.

     Raises:
         AssertionError: if ``plink_path + '.bed'`` or ``variant_path`` is
             not an existing file.
     """
     lg.debug("""
     Loading the following files:
     Plink: %s Pheno %s Variants: %s
      """, plink_path, pheno_path, variant_path)
     assert os.path.isfile(plink_path+'.bed')
     assert os.path.isfile(variant_path)
     self.plink_path = plink_path
     self.variant_path = variant_path
     self.bfile = PyPlink(self.plink_path)
     try:
         self.bim = self.bfile.get_bim()
         # Keep the marker name as a regular column (the BIM index holds it)
         self.bim['rsid'] = self.bim.index.values
         self.fam = self.bfile.get_fam()
         self.n_chrom = self.bim.chrom.nunique()
         self.variants = self._get_var(self.variant_path)
         self.genes = self.variants.gene.unique()
         self.pheno = self._get_pheno(pheno_path)
     finally:
         # The reader is only needed while loading; release it even when one
         # of the loading steps fails.
         self.bfile.close()
Exemple #4
0
 def _read_gene(self, gene: str) -> np.array:
     """Return the genotype matrix of the variants annotated to ``gene``.

     Args:
         gene: a gene name present in ``self.variants.gene``.

     Returns:
         A float array of shape ``(self.n, number of variants)`` with one
         column per variant; missing calls (``-1``) are replaced by ``0``.

     Raises:
         AssertionError: if the gene spans more than one chromosome or has
             three variants or fewer.
     """
     temp = self.variants[self.variants.gene == gene]
     chrom = temp.chrom.unique()
     assert len(chrom) == 1
     lg.debug(temp.head())
     marker = temp.rsid.values
     lg.debug(marker)
     p = len(marker)
     assert p > 3
     # NOTE(review): assumes self.n equals the number of samples stored in
     # the Plink file, otherwise the column assignment below fails - confirm.
     genotype_matrix = np.zeros((self.n, p))
     lg.info('Reading %s', gene)
     # The context manager closes the reader even if reading fails.
     with PyPlink(self.plink_path) as reader:
         for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
             genotype_matrix[:, u] = g
             lg.debug('Processed variant %s', i)
     genotype_matrix[genotype_matrix == -1] = 0
     return genotype_matrix
Exemple #5
0
def load_bed(geno_temp, chromo=1, indiv=None, mid_buffer=2e6):
    '''load bed-file and split in effect and null SNPs

    SNPs with 10% or more missing calls among the selected individuals are
    dropped; the remaining missing calls are replaced by the mean of the
    observed calls of that SNP.

    # Parameters:
    geno_temp (str): %-style template of the plink prefix, formatted with chromo
    chromo (int): chromosome number
    indiv (None or np.ndarray):
                        if None, use all individuals in bed-file
                        if np.array, use elements as individuals
    mid_buffer (int): number of bp up- and downstream of center of chromosome to leave out

    # Returns:
    G (pd.DataFrame): DataFrame with one indiv per row, one SNP per column
    eff_snps, null_snps (np.ndarray): array of ints with position of effect- and null-SNPs
                        (positions refer to the columns of G)
    rsid (np.ndarray) array of str with rsid names of SNPs in G
    '''
    print(chromo)
    # The context manager makes sure the bed-file is closed when done.
    with PyPlink(geno_temp % chromo) as bed:
        n_markers = bed.get_nb_markers()
        print(n_markers)
        fam = bed.get_fam()
        if indiv is None:
            indiv = fam.iid.astype(int)

        # Boolean mask over the rows of the fam-file, then the selected ids
        # in fam-file order
        ind = np.isin(fam.iid.astype(int), indiv)
        indiv = fam.loc[ind, 'iid'].astype(int).values

        G = []
        rsids = []
        removed = 0
        for rs, geno in tqdm(bed.iter_geno(), total=n_markers):
            # Cast to float first: writing the mean into the integer
            # genotype vector would silently truncate the imputed value.
            gen = geno[ind].astype(float)
            g_ind = gen == -1
            if g_ind.mean() < 0.1:
                gen[g_ind] = gen[~g_ind].mean()
                G.append(gen)
                rsids.append(rs)
            else:
                removed += 1
        print(f'removed {removed} SNPs due to missing>10%')

        G = pd.DataFrame(
            np.array(G).T,
            index=indiv,
            columns=['c%d:%d' % (chromo, x) for x in range(len(rsids))],
        )

        bim = bed.get_bim().loc[rsids]

    # SNPs further than mid_buffer below the centre are effect SNPs, those
    # further than mid_buffer above it are null SNPs.
    mid = bim.pos.min() + (bim.pos.max() - bim.pos.min()) // 2
    eff_snps = np.where(bim.pos < mid - mid_buffer)[0]
    null_snps = np.where(bim.pos > mid + mid_buffer)[0]

    return G, eff_snps, null_snps, np.array(rsids)
Exemple #6
0
    def setUpClass(cls):
        # Reading the reference dataset shipped with the tests
        data_fn = resource_filename(
            __name__, "data/statistics/linear.txt.bz2",
        )
        data = pd.read_csv(data_fn, sep="\t", compression="bz2")

        # The samples are named s1, s2, ... (file order) and used as index
        nb_rows = data.shape[0]
        data["sample"] = ["s{}".format(i) for i in range(1, nb_rows + 1)]
        data = data.set_index("sample")

        # The dummy phenotype container holds everything but the five SNPs
        snp_columns = ["snp{}".format(i) for i in range(1, 6)]
        cls.phenotypes = _DummyPhenotypes()
        cls.phenotypes.data = data.drop(snp_columns, axis=1)

        # The Plink files are written in a temporary directory
        cls.tmp_dir = TemporaryDirectory(prefix="genetest_test_linear_")
        cls.plink_prefix = os.path.join(cls.tmp_dir.name, "input")

        # The samples are shuffled so that the genotype file order differs
        # from the phenotype order
        new_sample_order = np.random.permutation(data.index)

        # The BED file (one marker per column starting with "snp")
        with PyPlink(cls.plink_prefix, "w") as bed:
            for column in data.columns:
                if column.startswith("snp"):
                    bed.write_genotypes(data.loc[new_sample_order, column])

        # The BIM file (chrom, name, cm, pos, a1, a2)
        bim_rows = (
            (3, "snp1", 0, 1234, "T", "C"),
            (3, "snp2", 0, 9618, "C", "A"),
            (2, "snp3", 0, 1519, "G", "T"),
            (1, "snp4", 0, 5871, "G", "A"),
            (23, "snp5", 0, 2938, "T", "C"),
        )
        with open(cls.plink_prefix + ".bim", "w") as bim:
            for bim_row in bim_rows:
                print(*bim_row, sep="\t", file=bim)

        # The FAM file (family and individual IDs are both the sample name)
        fam_tail = (0, 0, 0, -9)
        with open(cls.plink_prefix + ".fam", "w") as fam:
            for sample in new_sample_order:
                print(sample, sample, *fam_tail, file=fam)

        # The genotype parser reading the files that were just written
        plink_parser = parsers["plink"]
        cls.genotypes = plink_parser(cls.plink_prefix)
def _get_matrix(pfile, max_block):
    """Extract a genotype matrix from plink file.

    Args:
        pfile (str): the prefix of the binary plink files.
        max_block (int): the number of leading markers to read.

    Returns:
        numpy.ndarray: an int64 matrix with one row per sample of the FAM
        file and one column for each of the first ``max_block`` markers.

    Raises:
        AssertionError: if ``max_block`` exceeds the number of markers.

    """
    with PyPlink(pfile) as bed:
        n = bed.get_fam().shape[0]
        p = bed.get_bim().shape[0]
        assert (max_block <= p)
        genotypemat = np.zeros((n, max_block), dtype=np.int64)
        # zip() stops after max_block markers, so an empty block
        # (max_block == 0) never touches a non-existing column.
        for u, (_, genotypes) in zip(range(max_block), bed):
            genotypemat[:, u] = np.array(genotypes)
        return genotypemat
Exemple #8
0
    def setUpClass(cls):
        # Creating random data (100 samples: one phenotype, five covariates
        # and one SNP drawn from a binomial distribution)
        cls.data = pd.DataFrame(
            dict(
                pheno=np.random.randint(1, 100, 100),
                var1=np.random.randint(1, 100, 100),
                var2=np.random.rand(100),
                var3=["x{}".format(i) for i in np.random.randint(0, 3, 100)],
                var4=["y{}".format(i) for i in np.random.randint(0, 2, 100)],
                var5=np.random.randint(0, 4, 100),
                snp=binom.rvs(2, 0.3, size=100),
            ),
            index=["sample_{}".format(i + 1) for i in range(100)],
        )

        # Changing one factor to categorical data. The column is replaced
        # (rather than written through ``.loc[:, "var5"]``) because recent
        # pandas versions set values in place with ``.loc`` and would keep
        # the original integer dtype.
        cls.data["var5"] = cls.data.var5.astype("category")

        # Creating the dummy phenotype container
        phenotypes = ["pheno"] + ["var{}".format(i + 1) for i in range(5)]
        cls.phenotypes = _DummyPhenotypes()
        cls.phenotypes.data = cls.data[phenotypes].copy()

        # Creating a temporary directory
        cls.tmp_dir = TemporaryDirectory(prefix="genetest_")

        # The plink file prefix
        cls.plink_prefix = path.join(cls.tmp_dir.name, "input")

        # Permuting the sample to add a bit of randomness
        new_sample_order = np.random.permutation(cls.data.index)

        # Creating the BED file
        with PyPlink(cls.plink_prefix, "w") as bed:
            bed.write_genotypes(cls.data.loc[new_sample_order, "snp"])

        # Creating the BIM file
        with open(cls.plink_prefix + ".bim", "w") as bim:
            print(1, "snp", 0, 1, "B", "A", sep="\t", file=bim)

        # Creating the FAM file
        with open(cls.plink_prefix + ".fam", "w") as fam:
            for sample in new_sample_order:
                print(sample, sample, 0, 0, 0, -9, file=fam)

        # Creating the genotype parser
        cls.genotypes = parsers["plink"](cls.plink_prefix)
Exemple #9
0
    def setUpClass(cls):
        # Reading the reference dataset, indexed by sample ID
        data_fn = resource_filename(
            __name__, "data/statistics/factors.txt.bz2",
        )
        data = pd.read_csv(data_fn, sep="\t", compression="bz2")
        data = data.set_index("sample_id")

        # The dummy phenotype container holds every non-SNP column
        snp_columns = [
            name for name in data.columns if name.startswith("snp")
        ]
        cls.phenotypes = _DummyPhenotypes()
        cls.phenotypes.data = data.drop(snp_columns, axis=1)

        # The Plink files are written in a temporary directory
        cls.tmp_dir = TemporaryDirectory(prefix="genetest_test_linear_")
        cls.plink_prefix = os.path.join(cls.tmp_dir.name, "input")

        # The samples are shuffled so that the genotype file order differs
        # from the phenotype order
        new_sample_order = np.random.permutation(data.index)

        # The three markers written to the BED and BIM files
        snp_numbers = (1, 2, 3)

        # The BED file
        with PyPlink(cls.plink_prefix, "w") as bed:
            for number in snp_numbers:
                snp = "snp{}".format(number)
                bed.write_genotypes(data.loc[new_sample_order, snp])

        # The BIM file (marker number k sits at position k of chromosome 1)
        with open(cls.plink_prefix + ".bim", "w") as bim:
            for number in snp_numbers:
                print(1, "snp{}".format(number), 0, number, "B", "A",
                      sep="\t", file=bim)

        # The FAM file (family and individual IDs are both the sample name)
        fam_tail = (0, 0, 0, -9)
        with open(cls.plink_prefix + ".fam", "w") as fam:
            for sample in new_sample_order:
                print(sample, sample, *fam_tail, file=fam)
Exemple #10
0
def get_genotypes(rsid, plink_path, sub_in):
    """
    Retrieve genotype matrix from variant major format

    :param rsid: list of rsids
    :param plink_path: plink-stem path
    :param sub_in: subjects to include, given as a boolean mask over the
        samples of the plink file (its sum is the number of returned rows);
        must not be empty
    :return: genotypematrix, an int8 array with one row per included
        subject and one column per rsid
    """
    lg.debug('First item of sub_in is %s with %s', sub_in[0], type(sub_in[0]))
    genotypematrix = np.zeros((sum(sub_in), len(rsid)), dtype=np.int8)
    # Built once so that the membership test below is O(1) per marker
    requested = set(rsid)
    pos_index = 0
    # The context manager closes the reader even if reading fails
    with PyPlink(plink_path) as reader:
        for snp, genotype in reader.iter_geno_marker(rsid):
            if snp not in requested:
                continue
            genotypematrix[:, pos_index] = genotype[sub_in]
            pos_index += 1
    return genotypematrix
Exemple #11
0
    """Creates probabilities from an additive genotype."""
    if genotype == 0:
        return (1, 0, 0)

    if genotype == 1:
        return (0, 1, 0)

    if genotype == 2:
        return (0, 0, 1)

    if genotype == -1:
        # Lower than normal probabilities
        return (0.8, 0.1, 0.1)


with PyPlink("../plink/btest") as bed, \
        open("impute2_test.impute2", "w") as impute2_f, \
        open("impute2_test.sample", "w") as impute2_s:
    # Getting the FAM and the BIM
    fam = bed.get_fam()
    bim = bed.get_bim()

    # Generating the IMPUTE2 file
    for v, genotypes in bed.iter_geno():
        info = bim.loc[v, :]
        assert v == info.name

        r = re.search(r"(:dup[0-9]+)$", v)
        if r:
            v = v.replace(r.group(1), "")
Exemple #12
0
out = args.out

# In[16]:

log.info_head("Data Loading")

# # parse input files

# In[17]:

plink = None
plink_bim = None
plink_fam = None

if args.bfile is not None:
    plink = PyPlink(args.bfile)
    plink_bim = plink.get_bim()
    plink_fam = plink.get_fam().astype({
        'fid': str,
        'iid': str
    }).rename(
        columns={
            'fid': 'FID',
            'iid': 'IID',
            'father': 'fID',
            'mother': 'mID',
            'gender': 'sex'
        })

    log.info("{} samples ({} males, {} females) loaded from {}".format(
        plink_fam.shape[0], (plink_fam['sex'] == 1).sum(),
def convert_beeline(i_filenames, out_dir, locations, other_opts):
    """Convert beeline report(s) to Plink files.

    Each report must contain a ``[Header]`` section (holding the number of
    markers), a ``[Data]`` line, a comma-separated column header and then one
    row per sample/marker pair. Rows are grouped by sample, with the markers
    in the same order for every sample. One set of output files, named after
    the report, is written in ``out_dir`` for each report.

    Args:
        i_filenames (list): a list of file names (str), "-" meaning STDIN
        out_dir (str): the name of the output directory
        locations (dict): a dictionary from marker ID to genomic location
        other_opts(argparse.Namespace): the program options (``o_format``,
            ``nb_snps_kw`` and the ``beeline_id``, ``beeline_sample``,
            ``beeline_a1`` and ``beeline_a2`` column names are read)

    Raises:
        ProgramError: if a report has no ``[Header]`` or ``[Data]`` section,
            no number of markers, a missing column, unsorted or missing
            markers, or (BED output) a marker without mapping information.

    """
    # The samples that were already seen
    seen_samples = set()

    for i_filename in i_filenames:
        logging.info("Converting '{}'".format(i_filename))

        # Getting the output filename
        o_filename = os.path.splitext(os.path.basename(i_filename))[0]

        # Opening the file
        i_file = None
        if i_filename != "-":
            i_file = open(i_filename, "r")

        else:
            i_file = sys.stdin
            o_filename = "from_stdin"

        # The output files (opened inside the 'try' block so that a failure
        # while opening one of them still closes the others)
        prefix = os.path.join(out_dir, o_filename)
        pedfile = None
        mapfile = None
        bedfile = None
        bimfile = None
        famfile = None
        sample_file = None
        sample_file_end = None
        sample_file_sep = None

        # Reading the file (or STDIN)
        try:
            if other_opts.o_format == "ped":
                pedfile = open(prefix + ".ped", "w")
                mapfile = open(prefix + ".map", "w")
                # The sample line continues with the genotypes (PED)
                sample_file = pedfile
                sample_file_end = ""
                sample_file_sep = "\t"
            elif other_opts.o_format == "bed":
                bedfile = PyPlink(prefix, "w", "INDIVIDUAL-major")
                bimfile = open(prefix + ".bim", "w")
                famfile = open(prefix + ".fam", "w")
                sample_file = famfile
                sample_file_end = "\n"
                sample_file_sep = " "

            # The number of markers
            nb_markers = None

            # Skipping everything before the header section. The raw line is
            # tested for the end of file, since a blank line also becomes ""
            # once stripped.
            line = i_file.readline()
            while line.rstrip("\r\n") != "[Header]":
                if line == "":
                    raise ProgramError(
                        "{}: invalid header (missing '[Header]' "
                        "section)".format(i_filename)
                    )
                line = i_file.readline()
            line = line.rstrip("\r\n")

            # Reading the assay information
            while not line.startswith("[Data]"):
                if line.startswith(other_opts.nb_snps_kw):
                    nb_markers = int(line.rstrip("\r\n").split(",")[-1])
                line = i_file.readline()
                if line == "":
                    raise ProgramError(
                        "{}: invalid header (missing '[Data]' "
                        "section)".format(i_filename)
                    )

            if nb_markers is None:
                raise ProgramError(
                    "{}: invalid header (missing '{}' value)".format(
                        i_filename,
                        other_opts.nb_snps_kw,
                    )
                )

            logging.info("There are {:,d} markers".format(nb_markers))

            # Reading and checking the header
            header = {
                name: i
                for i, name in
                enumerate(i_file.readline().rstrip("\r\n").split(","))
            }
            required_columns = (
                other_opts.beeline_id, other_opts.beeline_sample,
                other_opts.beeline_a1, other_opts.beeline_a2,
            )
            for name in required_columns:
                if name not in header:
                    raise ProgramError(
                        "{}: '{}': missing column".format(i_filename, name),
                    )

            # Creating the list that will contain the marker list
            current_marker_i = 0
            all_markers = [None for i in range(nb_markers)]
            nb_samples = 0

            # The genotypes (if in BED format)
            genotypes = None
            if other_opts.o_format == "bed":
                genotypes = [-1 for i in range(nb_markers)]

            # Reading the first data line
            line = i_file.readline()
            row = line.rstrip("\r\n").split(",")

            # Allele of each markers
            marker_alleles = {}

            while line != "":
                # Getting the marker name and sample id
                sample = row[header[other_opts.beeline_sample]]
                if sample in seen_samples:
                    logging.warning("{}: duplicate sample "
                                    "found".format(sample))
                seen_samples.add(sample)

                # Logging
                logging.info("Processing {}".format(sample))
                print(sample, sample, "0", "0", "0", "-9", sep=sample_file_sep,
                      end=sample_file_end, file=sample_file)

                # Reading the rest of the data for this sample
                current_sample = sample
                while current_sample == sample:
                    # Checking the marker order
                    marker = row[header[other_opts.beeline_id]]

                    # If the index is > than the length, it might be a
                    # duplicated sample...
                    if current_marker_i == len(all_markers):
                        break

                    # The first sample defines the expected marker order
                    if all_markers[current_marker_i] is None:
                        all_markers[current_marker_i] = marker
                    if all_markers[current_marker_i] != marker:
                        raise ProgramError(
                            "{}: marker order is not the same for "
                            "sample '{}'".format(i_filename, sample)
                        )

                    # Getting the genotype (a "-" allele is a missing call)
                    allele_1 = row[header[other_opts.beeline_a1]]
                    allele_2 = row[header[other_opts.beeline_a2]]
                    genotype = "{} {}".format(allele_1, allele_2)
                    if "-" in genotype:
                        genotype = "0 0"

                    if other_opts.o_format == "ped":
                        pedfile.write("\t" + genotype)
                    else:
                        if marker not in locations:
                            raise ProgramError(
                                "{}: no mapping information".format(marker)
                            )

                        # Is this the first time we have seen this marker?
                        if marker not in marker_alleles:
                            marker_alleles[marker] = locations[marker].alleles

                        # Computing the genotypes
                        genotypes[current_marker_i] = encode_genotype(
                            allele_1, allele_2, marker_alleles[marker],
                        )

                    # Increasing the current marker
                    current_marker_i += 1

                    # Reading the next row
                    line = i_file.readline()
                    if line == "":
                        # End of file
                        break

                    # Splitting and current sample
                    row = line.rstrip("\r\n").split(",")
                    current_sample = row[header[other_opts.beeline_sample]]

                if other_opts.o_format == "ped":
                    pedfile.write("\n")
                else:
                    bedfile.write_genotypes(genotypes)

                # If there is only one marker, there is a problem
                if nb_markers != 1 and current_marker_i == 1:
                    raise ProgramError(
                        "{}: data should be sorted by samples, not by "
                        "markers ('{}' had 1 marker, expecting "
                        "{:,d})".format(i_filename, sample, nb_markers)
                    )

                # Are there any missing marker?
                if current_marker_i != nb_markers:
                    nb_missing = abs(current_marker_i - nb_markers)
                    raise ProgramError(
                        "{}: missing {} marker{} for sample '{}'".format(
                            i_filename,
                            nb_missing,
                            "s" if nb_missing > 1 else "",
                            sample,
                        )
                    )
                current_marker_i = 0
                nb_samples += 1

            # Closing the output files
            logging.info("Done writing {:,d} samples".format(nb_samples))

            # Printing the map file
            for marker in all_markers:
                marker_location = None
                if marker in locations:
                    marker_location = locations[marker]
                else:
                    marker_location = _unknown_location
                    logging.warning("{}: no mapping "
                                    "information".format(marker))
                if other_opts.o_format == "ped":
                    print(marker_location.chrom, marker, "0",
                          marker_location.pos, sep="\t", file=mapfile)
                else:
                    # Reversing the mapping so that the "A" and "B" codes
                    # give back the alleles
                    alleles = {
                        v: k for k, v in marker_alleles[marker].items()
                    }
                    print(marker_location.chrom, marker, "0",
                          marker_location.pos, alleles["B"], alleles["A"],
                          sep="\t", file=bimfile)

        finally:
            # Closing the input file
            i_file.close()

            # Closing the output files
            for f in (pedfile, mapfile, bedfile, bimfile, famfile):
                if f is not None:
                    f.close()
Exemple #14
0
class GeneReader(object):
    """Read gene-wise genotype matrices from a binary Plink dataset.

    The variant file assigns variants to genes; the phenotype file gives the
    case/control status of the samples.
    """

    def __init__(self, plink_path: str, pheno_path: str, variant_path: str):
        """Load the BIM/FAM tables, the variant table and the phenotypes.

        Args:
            plink_path: prefix of the binary Plink files (``.bed`` must
                exist).
            pheno_path: path of the phenotype file (see ``_get_pheno``).
            variant_path: path of the variant file (see ``_get_var``).

        Raises:
            AssertionError: if ``plink_path + '.bed'`` or ``variant_path``
                is not an existing file.
        """
        lg.debug("""
        Loading the following files:
        Plink: %s Pheno %s Variants: %s
         """, plink_path, pheno_path, variant_path)
        assert os.path.isfile(plink_path+'.bed')
        assert os.path.isfile(variant_path)
        self.plink_path = plink_path
        self.variant_path = variant_path
        self.bfile = PyPlink(self.plink_path)
        try:
            self.bim = self.bfile.get_bim()
            # Keep the marker name as a regular column so that it can be
            # used as a merge key (the BIM index holds it).
            self.bim['rsid'] = self.bim.index.values
            self.fam = self.bfile.get_fam()
            self.n_chrom = self.bim.chrom.nunique()
            self.variants = self._get_var(self.variant_path)
            self.genes = self.variants.gene.unique()
            self.pheno = self._get_pheno(pheno_path)
        finally:
            # The reader is only needed while loading; release it even when
            # one of the loading steps fails.
            self.bfile.close()

    def _get_var(self, variant_path: str) -> pd.DataFrame:
        """Read the variant file and keep the variants present in the BIM.

        Args:
            variant_path: header-less table with exactly four columns:
                chrom, pos, rsid and gene.

        Returns:
            The variant table inner-merged with the BIM table on
            ``pos``, ``chrom`` and ``rsid``.

        Raises:
            AssertionError: if the file does not have four columns or has
                three rows or fewer.
        """
        dat = pd.read_table(variant_path, header=None)
        lg.debug(dat.head())
        nrow, ncol = dat.shape
        assert ncol == 4
        assert nrow > 3
        dat.columns = ['chrom', 'pos', 'rsid', 'gene']
        n_chrom = dat.chrom.nunique()
        chromosomes = dat.chrom.unique()
        n_genes = dat.gene.nunique()
        lg.info('Got %s genes in variant file', n_genes)
        lg.info('Got %s variants in variant file', nrow)
        lg.debug('Chromosomes: %s', n_chrom)
        # The BIM chromosomes are collected once, not once per chromosome
        bim_chromosomes = set(self.bim.chrom.unique())
        chrom_check = [k for k in chromosomes if k in bim_chromosomes]
        lg.info('Found %s out of %s chromosomes in bim file',
                len(chrom_check), self.n_chrom)
        lg.debug(self.bim.head())
        dat = pd.merge(dat, self.bim, on=['pos', 'chrom', 'rsid'],
                       how='inner')
        n_var = dat.shape[0]
        lg.info('After merging with the bim file there are %s variants left',
                n_var)
        if n_var < nrow:
            lg.warning('After merging I lost %s variants',
                       nrow - n_var)
        return dat

    def _get_pheno(self, pheno_file: str) -> pd.DataFrame:
        """Read the phenotype file and merge it with the FAM table.

        Sets ``self.n`` (number of merged samples) and ``self.case_controls``
        (boolean array, True where ``Pheno > 0``).

        Args:
            pheno_file: header-less table with either three columns
                (fid, iid, Pheno) or six columns
                (fid, iid, father, mother, gender, Pheno).

        Returns:
            The FAM table merged with the phenotypes on ``fid`` and ``iid``.

        Raises:
            AssertionError: if the file has fewer than three columns, a
                single row or less, or if fewer than two samples are left
                after the merge.
            ValueError: if the number of columns is neither three nor six.
        """
        dat = pd.read_table(pheno_file, header=None)
        nrow, ncol = dat.shape
        assert ncol >= 3
        assert nrow > 1
        lg.debug(dat.head())
        if ncol == 3:
            dat.columns = ['fid', 'iid', 'Pheno']
        elif ncol == 6:
            dat.columns = ['fid', 'iid', 'father', 'mother', 'gender', 'Pheno']
        else:
            raise ValueError('Need at either a 3 or 6 column file')
        # Both identifiers are compared as strings in the merge below. Each
        # column is converted from itself: the individual IDs used to be
        # overwritten with the family IDs.
        dat['fid'] = dat['fid'].astype(str)
        dat['iid'] = dat['iid'].astype(str)
        lg.debug(self.fam.head())
        dat = pd.merge(self.fam, dat, on=['fid', 'iid'])
        self.n = dat.shape[0]
        lg.info('Using %s out of %s samples', self.n, nrow)
        if self.n < nrow:
            lg.warning('%s samples not in fam file', (nrow - self.n))
            if self.n < 2:
                raise AssertionError('Sample size is smaller than 2.')
        self.case_controls = (dat.Pheno > 0).values
        lg.info('Found %s cases and %s controls',
                np.sum(self.case_controls), np.sum(~self.case_controls))
        return dat

    def _read_gene(self, gene: str) -> np.array:
        """Return the genotype matrix of the variants annotated to ``gene``.

        Args:
            gene: a gene name present in ``self.variants.gene``.

        Returns:
            A float array of shape ``(self.n, number of variants)`` with one
            column per variant; missing calls (``-1``) are replaced by ``0``.

        Raises:
            AssertionError: if the gene spans more than one chromosome or
                has three variants or fewer.
        """
        temp = self.variants[self.variants.gene == gene]
        chrom = temp.chrom.unique()
        assert len(chrom) == 1
        lg.debug(temp.head())
        marker = temp.rsid.values
        lg.debug(marker)
        p = len(marker)
        assert p > 3
        # NOTE(review): self.n counts the samples left after the phenotype
        # merge while the reader yields every sample of the Plink file; the
        # column assignment below fails when the two differ - confirm the
        # intended subsetting.
        genotype_matrix = np.zeros((self.n, p))
        lg.info('Reading %s', gene)
        # The context manager closes the reader even if reading fails.
        with PyPlink(self.plink_path) as reader:
            for u, (i, g) in enumerate(reader.iter_geno_marker(marker)):
                genotype_matrix[:, u] = g
                lg.debug('Processed variant %s', i)
        genotype_matrix[genotype_matrix == -1] = 0
        return genotype_matrix

    def gene_iterator(self, genes=None) -> np.array:
        """Yield the genotype matrix of each gene, one gene at a time.

        Args:
            genes: the gene names to read; every gene of the variant table
                (``self.genes``) when None.
        """
        if genes is None:
            genes = self.genes
        for gene_name in genes:
            lg.debug('Getting gene %s', gene_name)
            yield self._read_gene(gene_name)
Exemple #15
0
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

import matplotlib.pyplot as plt

from pyplink import PyPlink

from basic_tools import *

# # load plink, aa and check integrity

# In[2]:

plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G.get_fam(
).astype({
    'fid': str,
    'iid': str
}).rename(columns={
    'fid': 'FID',
    'iid': 'IID'
})
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

# In[3]:

grm_path = 'data/genotype/4_merge/KCHIP_HLA_AA_SNP_1000G.grm'

# In[4]:
for i in {00..10};do python 5_1_association.py $i 1 0;done
for i in {11..20};do python 5_1_association.py $i 1 0;done
for i in {21..30};do python 5_1_association.py $i 1 0;done
for i in {31..40};do python 5_1_association.py $i 1 0;done
for i in {41..50};do python 5_1_association.py $i 1 0;done
for i in {51..60};do python 5_1_association.py $i 1 0;done
for i in {61..70};do python 5_1_association.py $i 1 0;done
for i in {71..80};do python 5_1_association.py $i 1 0;done
for i in {81..90};do python 5_1_association.py $i 1 0;done
for i in {91..97};do python 5_1_association.py $i 1 0;done
"""

# In[2]:

plink_KCHIP_HLA_AA_SNP_1000G = PyPlink(plink_KCHIP_HLA_AA_SNP_1000G_path)
plink_KCHIP_HLA_AA_SNP_1000G_fam = plink_KCHIP_HLA_AA_SNP_1000G.get_fam(
).astype({
    'fid': str,
    'iid': str
}).rename(columns={
    'fid': 'FID',
    'iid': 'IID'
})
plink_KCHIP_HLA_AA_SNP_1000G_bim = plink_KCHIP_HLA_AA_SNP_1000G.get_bim()

# In[3]:

#len(binary_continuous_traits)

# In[4]:
Exemple #17
0
class PlinkReader(GenotypesReader):
    def __init__(self, prefix):
        """Open a binary Plink dataset and cache its BIM and FAM tables.

        Args:
            prefix (str): the prefix of the Plink binary files.

        """
        reader = PyPlink(prefix)
        self.bed = reader
        self.bim = reader.get_bim()
        self.fam = reader.get_fam()

        # A variant is flagged as multi-allelic when at least one other
        # variant of the BIM shares its chromosome and position.
        self.bim["multiallelic"] = False
        shared_locus = self.bim.duplicated(["chrom", "pos"], keep=False)
        self.bim.loc[shared_locus, "multiallelic"] = True

        # The samples are indexed by individual ID when those are unique,
        # otherwise by the "<fid>_<iid>" combination.
        try:
            self.fam = self.fam.set_index("iid", verify_integrity=True)
        except ValueError:
            logger.info(
                "Setting the index as 'fid_iid' because the individual IDs "
                "are not unique.")

            composite_ids = []
            for family_id, individual_id in zip(self.fam.fid, self.fam.iid):
                composite_ids.append(
                    "{fid}_{iid}".format(fid=family_id, iid=individual_id)
                )
            self.fam["fid_iid"] = composite_ids
            self.fam = self.fam.set_index("fid_iid", verify_integrity=True)

    def close(self):
        """Close the underlying Plink reader."""
        self.bed.close()

    def get_variant_genotypes(self, variant):
        """Get the genotypes from a well formed variant instance.

        The variant is located in the BIM by chromosome and position.

        Args:
            variant (Variant): A Variant instance.

        Returns:
            A list of Genotypes instance containing a pointer to the variant as
            well as a vector of encoded genotypes. The list is empty when no
            BIM entry matches.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        plink_chrom = CHROM_STR_TO_INT[variant.chrom]
        at_locus = ((self.bim.chrom == plink_chrom) &
                    (self.bim.pos == variant.pos))
        matches = self.bim.loc[at_locus, :]

        nb_matches = matches.shape[0]
        if nb_matches == 0:
            return []
        if nb_matches == 1:
            return self._get_biallelic_variant(variant, matches)
        return self._get_multialleic_variant(variant, matches)

    def _get_biallelic_variant(self, variant, info, _check_alleles=True):
        """Get the genotypes of the first BIM row of ``info``.

        An empty list is returned when the alleles are checked and the BIM
        alleles, once encoded, differ from the alleles of ``variant``.

        """
        # From 1.3.2 onwards, PyPlink sets unique names.
        bim_row = info.iloc[0, :]
        bim_alleles = variant._encode_alleles([bim_row.a2, bim_row.a1])
        if _check_alleles and bim_alleles != variant.alleles:
            # Variant with requested alleles is unavailable.
            return []

        raw_geno = self.bed.get_geno_marker(bim_row.name)
        geno = self._normalize_missing(raw_geno)
        return [Genotypes(variant, geno, bim_row.a2, bim_row.a1, False)]

    def _get_multialleic_variant(self, variant, info):
        """Get the genotypes of a locus that has several BIM rows."""
        if variant.alleles is None:
            # Without requested alleles, every bi-allelic variant found at
            # the locus is returned.
            everything = []
            for name, row in info.iterrows():
                geno = self._normalize_missing(
                    self.bed.get_geno_marker(name)
                )
                everything.append(
                    Genotypes(variant, geno, row.a2, row.a1, True)
                )
            return everything

        # Only the rows whose alleles are all among the requested ones are
        # kept.
        selected = []
        for name, row in info.iterrows():
            row_alleles = set(Variant._encode_alleles((row.a1, row.a2)))
            if not row_alleles.issubset(variant.alleles_set):
                continue
            selected.extend(
                self._get_biallelic_variant(
                    variant, info.loc[[name], :], _check_alleles=False,
                )
            )
        return selected

    def _as_genotypes(self, info, geno):
        """Wrap a BIM row and its raw genotype vector into a Genotypes."""
        variant = Variant(info.name, CHROM_INT_TO_STR[info.chrom], info.pos,
                          [info.a1, info.a2])
        return Genotypes(
            variant,
            self._normalize_missing(geno),
            reference=info.a2,
            coded=info.a1,
            multiallelic=info.multiallelic,
        )

    def iter_genotypes(self):
        """Iterates on available markers.

        Returns:
            Genotypes instances.

        Note
        ====
            If the sample IDs are not unique, the index is changed to be the
            sample family ID and individual ID (i.e. fid_iid).

        """
        # The i-th marker of the BED is described by the i-th BIM row
        for i, (_, genotypes) in enumerate(self.bed.iter_geno()):
            yield self._as_genotypes(self.bim.iloc[i, :], genotypes)

    def iter_variants(self):
        """Iterate over marker information."""
        for _, bim_row in self.bim.iterrows():
            alleles = [bim_row.a1, bim_row.a2]
            yield Variant(bim_row.name, CHROM_INT_TO_STR[bim_row.chrom],
                          bim_row.pos, alleles)

    def get_variants_in_region(self, chrom, start, end):
        """Iterate over variants in a region (both bounds are inclusive)."""
        on_chrom = self.bim["chrom"] == CHROM_STR_TO_INT[chrom]
        after_start = start <= self.bim["pos"]
        before_end = self.bim["pos"] <= end
        region = self.bim.loc[on_chrom & after_start & before_end]

        markers = self.bed.iter_geno_marker(region.index)
        for i, (_, geno) in enumerate(markers):
            yield self._as_genotypes(region.iloc[i, :], geno)

    def get_variant_by_name(self, name):
        """Get the genotype of a marker using it's name.

        Args:
            name (str): The name of the marker.

        Returns:
            list: A list of Genotypes (only one for PyPlink, see note below).

        Note
        ====
            From PyPlink version 1.3.2 and onwards, each name is unique in the
            dataset. Hence, we can use the 'get_geno_marker' function and be
            sure only one variant is returned.

        """
        try:
            geno, i = self.bed.get_geno_marker(name, return_index=True)

        except ValueError:
            duplicated = self.bed.get_duplicated_markers()
            if name not in duplicated:
                # The variant is not in the BIM file
                logger.warning("Variant {} was not found".format(name))
                return []

            # The name is shared by several markers: each of them is
            # retrieved using its own name (with the :dupx suffix)
            return [
                self.get_variant_by_name(dup_name).pop()
                for dup_name in duplicated[name]
            ]

        return [self._as_genotypes(self.bim.iloc[i, :], geno)]

    def get_number_samples(self):
        """Returns the number of samples.

        Returns:
            int: The number of samples.
        """
        return self.bed.get_nb_samples()

    def get_number_variants(self):
        """Returns the number of markers.

        Returns:
            int: The number of markers.
        """
        return self.bed.get_nb_markers()

    def get_samples(self):
        """Returns the sample IDs (the index of the FAM table) as a list."""
        return list(self.fam.index)

    @staticmethod
    def _normalize_missing(g):
        """Return a float copy of a plink vector with -1 replaced by NaN."""
        as_float = g.astype(float)
        as_float[as_float == -1.0] = np.nan
        return as_float
Exemple #18
0
def extract_markers(fn, to_extract, out_prefix, out_format, prob_t, is_long):
    """Extracts according to names.

    The requested markers are located through the index of the input file,
    read one line at a time and handed to ``print_data`` for every requested
    output format. All the output files are closed when the function exits,
    even if the extraction fails midway.

    Args:
        fn (str): the name of the input file
        to_extract (set): the list of markers to extract for each input file
        out_prefix (str): the output prefix
        out_format (list): the output format(s)
        prob_t (float): the probability threshold
        is_long (bool): True if format needs to be long

    """
    # The output files are registered as soon as they are opened so that the
    # 'finally' clause can close them whatever happens afterwards
    o_files = {}
    try:
        # The output files (probabilities)
        for suffix in out_format:
            if suffix != "bed" and suffix not in o_files:
                o_files[suffix] = open(out_prefix + "." + suffix, "w")

        # If there is the 'bed' format, we actually need pyplink
        if "bed" in out_format:
            o_files["bed"] = (
                PyPlink(out_prefix, "w"),
                open(out_prefix + ".bim", "w"),
            )

        # Creating a fam (if bed)
        samples = get_samples(get_file_prefix(fn) + ".sample")
        sample_names = [
            "{}/{}".format(id_1, id_2)
            for id_1, id_2 in zip(samples.ID_1, samples.ID_2)
        ]

        # Writing the header (for dosage)
        if "dosage" in o_files:
            if is_long:
                print("fid",
                      "iid",
                      "chrom",
                      "pos",
                      "name",
                      "minor",
                      "major",
                      "dosage",
                      sep="\t",
                      file=o_files["dosage"])
            else:
                print("chrom",
                      "pos",
                      "name",
                      "minor",
                      "major",
                      *sample_names,
                      sep="\t",
                      file=o_files["dosage"])

        # Writing the header (for calls)
        if "calls" in o_files:
            if is_long:
                print("fid",
                      "iid",
                      "chrom",
                      "name",
                      "cm",
                      "pos",
                      "call",
                      sep="\t",
                      file=o_files["calls"])
            else:
                print("chrom",
                      "name",
                      "cm",
                      "pos",
                      *sample_names,
                      sep="\t",
                      file=o_files["calls"])

        # The names of the markers that were actually read from the file
        extracted = set()

        # Finding the name of the file containing the index
        file_index = index.get_index(fn,
                                     cols=[0, 1, 2],
                                     names=["chrom", "name", "pos"],
                                     sep=" ")

        # Keeping only required values from the index
        file_index = file_index[file_index.name.isin(to_extract)]

        # Getting all the markers value
        logging.info("Extracting {:,d} markers".format(len(file_index)))
        with index.get_open_func(fn)(fn, "r") as i_file:
            for seek_value in file_index.seek.values:
                # Jumping directly to the line of the marker
                i_file.seek(int(seek_value))

                # Reading the line
                line = i_file.readline()
                row = line.rstrip("\n").split(" ")

                # Printing the data
                print_data(o_files,
                           prob_t,
                           samples.ID_1,
                           samples.ID_2,
                           line=line,
                           row=row,
                           is_long=is_long)

                # Saving statistics (the marker name is the second field)
                extracted.add(row[1])

        logging.info("Extracted {:,d} markers".format(len(extracted)))
        missing = to_extract - extracted
        if missing:
            logging.warning("Missing {:,d} "
                            "markers".format(len(missing)))

        # Extracting the companion files (if impute2 and files are present)
        if "impute2" in o_files:
            extract_companion_files(
                i_prefix=get_file_prefix(fn),
                to_extract=to_extract,
                o_prefix=out_prefix,
            )

        # Writing the FAM file if bed
        if "bed" in o_files:
            cols = ["ID_1", "ID_2", "father", "mother", "sex", "plink_pheno"]
            samples[cols].to_csv(out_prefix + ".fam",
                                 sep=" ",
                                 index=False,
                                 header=False)

    finally:
        # Closing the files (the 'bed' entry holds the BED writer and the
        # BIM file handle)
        for o_format, o_file in o_files.items():
            if o_format == "bed":
                for handle in o_file:
                    handle.close()
            else:
                o_file.close()

    # Extraction complete
    logging.info("Extraction of {:,d} markers "
                 "completed".format(len(extracted)))
Exemple #19
0
    def test_smaller_intersect(self):
        """Checks the data matrix when containers share only some samples."""
        # Ten random samples are excluded: the first five are removed from
        # the phenotypes and the last five from the genotypes
        excluded = np.random.choice(self.data.index, 10, replace=False)
        pheno_excluded, geno_excluded = excluded[:5], excluded[5:]

        phenotypes = _DummyPhenotypes()
        phenotypes.data = self.data.drop(pheno_excluded, axis=0)

        prefix = self.plink_prefix + "_less"
        geno_data = self.data.drop(geno_excluded, axis=0)

        # Writing the binary Plink dataset (BED, then BIM, then FAM)
        with PyPlink(prefix, "w") as bed_writer:
            bed_writer.write_genotypes(geno_data.snp)

        with open(prefix + ".bim", "w") as bim_file:
            print(1, "snp", 0, 1, "B", "A", sep="\t", file=bim_file)

        with open(prefix + ".fam", "w") as fam_file:
            for sample_id in geno_data.index:
                print(sample_id, sample_id, 0, 0, 0, -9, file=fam_file)

        # The model: one genotype and two phenotype predictors
        predictors = [
            spec.genotypes.snp,
            spec.phenotypes.var1,
            spec.phenotypes.var2,
        ]
        modelspec = spec.ModelSpec(
            outcome=spec.phenotypes.pheno,
            predictors=predictors,
            test="linear",
        )

        # Building the observed matrix from both containers
        with parsers["plink"](prefix) as genotypes:
            matrix = modelspec.create_data_matrix(phenotypes, genotypes)

        # Only the samples excluded from neither container are expected
        expected = self.data.drop(excluded, axis=0)

        # The shape of the matrix
        expected_shape = (expected.shape[0], 5)
        self.assertEqual(expected_shape, matrix.shape,
                         "The observed matrix is not of the right shape")

        # The intercept
        intercept_values = matrix.intercept.unique().tolist()
        self.assertEqual([1], intercept_values,
                         "The intercept is not as expected")

        # The outcome
        observed_outcome = matrix.loc[expected.index,
                                      spec.phenotypes.pheno.id]
        self.assertTrue(observed_outcome.equals(expected.pheno),
                        "The outcomes are not as expected")

        # The predictors (compared through their translated names)
        translations = modelspec.get_translations()
        for predictor in predictors:
            name = translations[predictor.id]
            observed = matrix.loc[expected.index, predictor.id].values
            np.testing.assert_array_equal(
                observed,
                expected[name].values,
                err_msg="The predictor '{}' is not as expected".format(name),
            )
Exemple #20
0
def prepare_simulation(sample_size,
                       n_geno=1000,
                       seed=321,
                       exp_var=0.5,
                       n_causal=10,
                       threads=8):
    """Create a run directory filled with dummy simulation inputs.

    A new ``runs/run_<uuid>`` directory is created and populated with an
    individual file, a covariate file, a TOML configuration file and one
    binary Plink dataset (BED/BIM/FAM) per chromosome. Nothing is returned.

    Args:
        sample_size (int): the number of individuals to generate.
        n_geno (int): the number of variants sampled from the LD score table.
        seed (int): the seed stored in the configuration ('seeds'). It is
            not used to seed the random draws made by this function.
        exp_var (float): the value stored in the configuration ('exp_vars').
        n_causal (int): the value stored in the configuration ('n_causal').
        threads (int): the value stored in the configuration ('threads').

    """
    run_id = uuid.uuid4()
    run_dir = join('runs', f'run_{run_id}')
    os.makedirs(run_dir, exist_ok=True)
    print(f'creating dummy data in {run_dir}...')

    # create indiv file (FID and IID are the same consecutive integers)
    indiv_path = join(run_dir, 'indiv.txt')
    start = 10000000
    iid = pd.DataFrame([[i, i] for i in range(start, start + sample_size)],
                       columns=['FID', 'IID'])
    iid.to_csv(indiv_path, sep=' ', header=False, index=False)

    # create covariate file (age in [30, 80], sex in {0, 1})
    cov_path = join(run_dir, 'covariates.txt')
    age = np.random.randint(30, 81, sample_size)
    sex = np.random.randint(0, 2, sample_size)
    iid['age'] = age
    iid['sex'] = sex
    iid.to_csv(cov_path, sep=' ', header=True, index=False)

    # create toml file
    config_path = join(run_dir, 'config.toml')
    config = {
        # general parameters
        'seeds': [seed],
        'exp_vars': [exp_var],
        'n_causal': n_causal,
        'sample_sizes': [sample_size],
        'wdir': run_dir,

        # genetic parameters
        'bed': join(run_dir, 'chr%s'),
        'fam': join(run_dir, 'chr1.fam'),
        'normalize': True,
        'indiv': indiv_path,
        'mid_buffer': 0,

        # GAN parameters
        'stylegan2_models': '../models/',
        'stylegan2_name': 'stylegan2_healthy',
        'psi': 0.4,
        'diff_noise': False,
        'mult_scale': 2.0,

        # feature condensation parameters
        'n_pcs': 100,
        'size': 448,
        'tfms': 'tta',
        'model': 'resnet50',
        'pretraining': 'imagenet',
        'spatial': 'mean',
        'layers': ['L4'],

        # GWAS parameters
        'first_pc': 0,
        'last_pc': 9,
        'cov': cov_path,
        'cov_columns': ['sex'],
        'qcov_columns': ['age'],
        'threads': threads,
        'bolt': '../BOLT-LMM_v2.3.4/',
        'ref_map': "",
        'ldscores': "",
    }
    # The context manager guarantees the configuration is flushed and the
    # handle closed before anything else reads the file
    with open(config_path, 'w') as config_file:
        toml.dump(config, config_file)

    # create genetic data (sex is shifted from {0, 1} to {1, 2} for the FAM)
    iid['father'] = 0
    iid['mother'] = 0
    iid['sex'] += 1
    iid['pheno'] = -9
    fam_df = iid[['FID', 'IID', 'father', 'mother', 'sex', 'pheno']]

    ld_df = pd.read_csv(join(BOLT_DIR, 'tables', 'LDSCORE.1000G_EUR.tab.gz'),
                        sep='\t')
    ld_df = ld_df.sample(n_geno).sort_values(['CHR', 'BP'])
    # Each genotype is the number of the two uniform draws that exceed the
    # MAF of the variant; G has one row per variant, one column per sample
    G = (np.random.rand(n_geno, sample_size, 2) > ld_df.MAF.values.reshape(
        -1, 1, 1)).sum(-1)

    # The rows of G follow the sorted table, so each chromosome owns a
    # contiguous slice starting at 'last_ind'
    last_ind = 0
    for chromo, chromo_df in tqdm(ld_df.groupby('CHR')):
        gen_path = join(run_dir, 'chr%d' % chromo)

        n_chromo = len(chromo_df)
        chromo_G = G[last_ind:(last_ind + n_chromo)]
        last_ind += n_chromo

        with PyPlink(gen_path, 'w') as bed:
            for g in chromo_G:
                bed.write_genotypes(g)

        fam_df.to_csv(gen_path + '.fam', sep='\t', header=False, index=False)

        bim_df = pd.DataFrame(
            np.array([
                n_chromo * [chromo],
                chromo_df.SNP.values,
                n_chromo * [0],
                chromo_df.BP.values,
                n_chromo * ['C'],
                n_chromo * ['A'],
            ]).T)
        bim_df.to_csv(gen_path + '.bim', sep='\t', header=False, index=False)
Exemple #21
0
class ReadPlink(object):

    """Reads plink files and allows random sampling"""

    def __init__(self, plinkstem):
        """
        :param plinkstem: plink stem file path

        """
        self._plinkstem = plinkstem
        # File names only: os.path.basename drops the directory of the stem
        self._bim_path = os.path.basename(self._plinkstem)+'.bim'
        self._bed_path = os.path.basename(self._plinkstem)+'.bed'
        self._fam_path = os.path.basename(self._plinkstem)+'.fam'

        self.plinkfile = PyPlink(self._plinkstem)
        self.fam = self.plinkfile.get_fam()
        self.bim = self.plinkfile.get_bim()
        # N subjects (FAM rows) and P variants (BIM rows)
        self.N = self.fam.shape[0]
        self.P = self.bim.shape[0]
        # Index labels drawn by the last call to sample()
        self._sample_subjects = None
        self._sample_variants = None

    def sample(self, n, p, write_disk=False):
        """Samples from a plink file with random SNPs and subjects

        Subjects and variants are drawn with np.random.choice from the index
        of the FAM and BIM data, so the same subject or variant may be drawn
        more than once. The full genotype vector of every sampled variant is
        read before the sampled subjects are selected from it.

        :param n: number of subjects to sample
        :param p: number of variants to sample
        :param write_disk: bool, write the sampled variants and subjects to
            'sampled_variants.csv' and 'sampled_subjects.csv' in the current
            working directory
        :returns: a numpy matrix of size n*p

        """
        self._sample_subjects = np.random.choice(self.fam.index.values, n,
                                                 replace=True)
        self._sample_variants = np.random.choice(self.bim.index.values, p)

        if write_disk:
            # The sampled values are index labels, not positions, hence the
            # label-based .loc lookup (.iloc fails on non-integer labels)
            self.bim.loc[self._sample_variants].to_csv('sampled_variants.csv')
            self.fam.loc[self._sample_subjects].to_csv('sampled_subjects.csv')

        genotypematrix = self.read_bed(self._sample_variants,
                                       self._sample_subjects)

        return genotypematrix

    def read_bed(self, marker=None, subjects=None):
        """read bed file

        :param marker: list of SNPs (all the SNPs of the BIM data if None)
        :param subjects: list of subjects, used to index the genotype vector
            of each marker (all the subjects of the FAM data if None)
        :returns: genotypematrix of size subjects*marker, where negative
            genotypes are replaced by 0

        """
        if marker is None:
            P_size = self.P
            marker = self.bim.index.values
        else:
            P_size = len(marker)

        if subjects is None:
            N_size = self.N
            subjects = self.fam.index.values
        else:
            N_size = len(subjects)

        genotypematrix = np.zeros((N_size, P_size), dtype=np.int8)

        # One column per marker, in the order the reader yields them
        genotypes = self.plinkfile.iter_geno_marker(marker)
        for j, (_, geno) in enumerate(genotypes):
            genotypematrix[:, j] = geno[subjects]

        # Negative values are overwritten with 0
        genotypematrix[genotypematrix < 0] = 0

        return genotypematrix
args = parser.parse_args()

top_variant = None
top_p = 1.0
with open(args.assoc, 'rb') as ihandle:
    rdr = csv.DictReader(ihandle, delimiter='\t')
    for row in rdr:
        p = float(row['p_lrt'])
        if p < top_p:
            top_p = p
            top_variant = row['rs']
if top_variant is None:
    raise Exception('No top variant chosen')
print top_variant

p = PyPlink(args.plink)
g = None
for marker, genotypes in p:
    if marker == top_variant:
        s = sum(x for x in genotypes.tolist() if x != -1)
        c = sum(1. for x in genotypes.tolist() if x != -1)
        meang = s / c
        g = [str(x) if x != -1 else str(meang) for x in genotypes]
        break
if g is None:
    raise Exception('Could not find top variant in plink file')

if args.previous is None or args.previous == 'NONE':
    previous = [['1'] for x in g]
else:
    previous = []
    bootstrapped_list = [
        random.randint(0, number_markers - 1) for _ in range(number_markers)
    ]
    bootstrapped_list.sort()
    return bootstrapped_list


### parsing command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('bfile',
                    help='plink binary (bed, bim, fam) dataset prefix')
parser.add_argument('nboot', help='number of bootstrap replicates', type=int)
args = parser.parse_args()

### perform plink bootstrap replicates
with PyPlink(args.bfile) as bed:
    bim = bed.get_bim()  # returns pandas.DataFrame of bim file

    nmarkers = bed.get_nb_markers()
    nsamples = bed.get_nb_samples()
    print("### Loaded {0} markers and {1} samples...".format(
        nmarkers, nsamples))

    for rep in range(args.nboot):
        print("### Performing bootstrap replicate {0} of {1}...".format(
            rep + 1, args.nboot))
        rep_list = get_resampled_loci(
            nmarkers)  # gets a list of resampled markers

        rep_basename = "rep" + str(rep)
        with PyPlink(rep_basename,
Exemple #24
0
    MAX_MAF = max(mafs) if mafs else 1
    MAX_MAC = max(macs) if macs else 9000000000
    OUT_QUALIFIED_VARIANTS = True if args['--ov'] else False

    if len(mafs) == 0 and len(macs) == 0:
        sys.stderr.write(
            "At least one should be open: '--alt-frqs' and/or '--alt-acs'\n")
        sys.exit(-1)

    # Open and work on VCF file.
    # https://lemieuxl.github.io/pyplink/
    # API: https://lemieuxl.github.io/pyplink/pyplink.html
    # API demo: https://nbviewer.jupyter.org/github/lemieuxl/pyplink/blob/master/demo/PyPlink%20Demo.ipynb
    from pyplink import PyPlink
    # Getting the BED BIM FAM
    bed = PyPlink(plink_prefix)
    bim = bed.get_bim()
    fam = bed.get_fam()
    dup = bed.get_duplicated_markers()
    if dup:
        sys.stderr.write('ERROR: Duplicate markers found as above!!!\n')
        sys.exit(-1)

    snp_sets = set(bim.index)
    select_samples = [x in samples for x in fam.loc[:, 'iid']
                      ]  # True/false array for sample we want to keep.

    # print(vcf.samples)
    out = "#CHROM  BEGIN   END     MARKER_ID       NUM_ALL_VARS    NUM_PASS_VARS   NUM_SING_VARS MAF/MAC_CUT".split(
    )
    sys.stdout.write(