Beispiel #1
0
    def save_snps(self, filename=None):
        """ Save SNPs to a CSV file in the output directory.

        A comment block describing the source(s), assembly, SNP count, and
        chromosome summary is handed to ``save_df_as_csv`` together with the
        SNPs and the column header.

        Parameters
        ----------
        filename : str, optional
            filename for file to save; when omitted, a name is built from the
            representation of the source(s) and the assembly

        Returns
        -------
        str
            path to file in output directory if SNPs were saved, else empty str
        """
        # one line per piece of metadata, in the order it appears in the file
        comment = "".join([
            "# Source(s): {}\n".format(self.source),
            "# Assembly: {}\n".format(self.assembly),
            "# SNPs: {}\n".format(self.snp_count),
            "# Chromosomes: {}\n".format(self.chromosomes_summary),
        ])

        if filename is None:
            filename = "".join((
                self.get_var_repr(self._source),
                "_lineage_",
                self.assembly,
                ".csv",
            ))

        column_names = ["chromosome", "position", "genotype"]

        return save_df_as_csv(
            self._snps,
            self._output_dir,
            filename,
            comment=comment,
            header=column_names,
        )
Beispiel #2
0
    def _save_discrepant_snps_file(self, df, name, filename):
        """ Save discrepant SNPs to a CSV file in the output directory.

        Parameters
        ----------
        df : pandas.DataFrame
            data to save (presumably the discrepant SNPs; passed through
            unchanged to ``save_df_as_csv``)
        name : str
            label used to build the default filename
        filename : str or None
            filename for file to save; if None, a name is built from the
            representation of this object's name and `name`

        Returns
        -------
        result of ``save_df_as_csv`` for the written file
        """
        if filename is not None:
            target = filename
        else:
            target = "".join(
                (self.get_var_repr(self._name), "_", name, ".csv"))

        source_comment = "# Source(s): {}\n".format(self.source)

        return save_df_as_csv(df,
                              self._output_dir,
                              target,
                              comment=source_comment)
Beispiel #3
0
    def _find_shared_dna_output_helper(
        self,
        individual1,
        individual2,
        one_chrom_shared_dna,
        two_chrom_shared_dna,
        one_chrom_shared_genes,
        two_chrom_shared_genes,
    ):
        """ Write the plot and CSV files for a shared DNA comparison.

        A chromosome plot is produced when the output directory can be
        created, and each non-empty table is saved to its own CSV file in the
        output directory.

        Parameters
        ----------
        individual1 : Individual
        individual2 : Individual
        one_chrom_shared_dna : pandas.DataFrame
            segments of DNA shared on one chromosome
        two_chrom_shared_dna : pandas.DataFrame
            segments of DNA shared on both chromosomes
        one_chrom_shared_genes : pandas.DataFrame
            genes shared on one chromosome
        two_chrom_shared_genes : pandas.DataFrame
            genes shared on both chromosomes
        """
        cytobands = self._resources.get_cytoBand_hg19()

        if create_dir(self._output_dir):
            plot_name = "shared_dna_{}_{}.png".format(
                individual1.get_var_name(), individual2.get_var_name())
            plot_path = os.path.join(self._output_dir, plot_name)
            plot_title = "{} / {} shared DNA".format(individual1.name,
                                                     individual2.name)
            # trailing 37 is presumably the genome build (GRCh37) - confirm
            # against ``plot_chromosomes``
            plot_chromosomes(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                cytobands,
                plot_path,
                plot_title,
                37,
            )

        # (table, filename template, extra keyword arguments for the CSV writer);
        # only the shared DNA tables are written with a float format
        outputs = (
            (one_chrom_shared_dna,
             "shared_dna_one_chrom_{}_{}_GRCh37.csv",
             {"float_format": "%.2f"}),
            (two_chrom_shared_dna,
             "shared_dna_two_chroms_{}_{}_GRCh37.csv",
             {"float_format": "%.2f"}),
            (one_chrom_shared_genes,
             "shared_genes_one_chrom_{}_{}_GRCh37.csv",
             {}),
            (two_chrom_shared_genes,
             "shared_genes_two_chroms_{}_{}_GRCh37.csv",
             {}),
        )

        for table, template, csv_options in outputs:
            if len(table) > 0:
                csv_name = template.format(individual1.get_var_name(),
                                           individual2.get_var_name())
                save_df_as_csv(table, self._output_dir, csv_name,
                               **csv_options)
Beispiel #4
0
    def find_discordant_snps(self,
                             individual1,
                             individual2,
                             individual3=None,
                             save_output=False):
        """ Find discordant SNPs between two or three individuals.

        SNPs with a null genotype for `individual1` are dropped first. A SNP is
        then discordant with a comparison individual when both genotypes have
        one allele and differ, or both have two alleles and share none. With
        three individuals, a SNP is also discordant when both comparison
        individuals have the same homozygous genotype and `individual1`'s
        genotype differs from it.

        Parameters
        ----------
        individual1 : Individual
            reference individual (child if `individual2` and `individual3` are parents)
        individual2 : Individual
            comparison individual
        individual3 : Individual
            other parent if `individual1` is child and `individual2` is a parent
        save_output : bool
            specifies whether to save output to a CSV file in the output directory

        Returns
        -------
        pandas.DataFrame
            discordant SNPs and associated genetic data

        References
        ----------
        .. [1] David Pike, "Search for Discordant SNPs in Parent-Child
          Raw Data Files," David Pike's Utilities,
          http://www.math.mun.ca/~dapike/FF23utils/pair-discord.php
        .. [2] David Pike, "Search for Discordant SNPs when given data
          for child and both parents," David Pike's Utilities,
          http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
        """

        def discordant_with(reference, comparison):
            # mask of SNPs where `reference` shares no allele with `comparison`
            one_allele_each = ((reference.str.len() == 1)
                               & (comparison.str.len() == 1))
            two_alleles_each = ((reference.str.len() == 2)
                                & (comparison.str.len() == 2))
            no_allele_in_common = (
                (reference.str[0] != comparison.str[0])
                & (reference.str[0] != comparison.str[1])
                & (reference.str[1] != comparison.str[0])
                & (reference.str[1] != comparison.str[1]))

            # ``&`` binds tighter than ``|``, so the not-null test applies to
            # the one-allele clause only; the two-allele clause is already
            # false for a missing genotype because its length test fails
            return ((comparison.notnull()
                     & one_allele_each
                     & (reference != comparison))
                    | (two_alleles_each & no_allele_in_common))

        self._remap_snps_to_GRCh37([individual1, individual2, individual3])

        # start from the reference individual's SNPs that have a genotype
        reference_snps = individual1.snps
        reference_snps = reference_snps.loc[
            reference_snps["genotype"].notnull()]

        # add SNPs shared with `individual2`
        merged = reference_snps.join(individual2.snps["genotype"],
                                     rsuffix="2")

        genotype1 = "genotype_" + individual1.get_var_name()
        genotype2 = "genotype_" + individual2.get_var_name()

        if individual3 is None:
            merged = merged.rename(columns={
                "genotype": genotype1,
                "genotype2": genotype2,
            })

            # find discordant SNPs between reference and comparison individuals
            pair_mask = discordant_with(merged[genotype1], merged[genotype2])
            discordant = merged.loc[pair_mask]

            if save_output:
                pair_filename = "discordant_snps_{}_{}_GRCh37.csv".format(
                    individual1.get_var_name(), individual2.get_var_name())
                save_df_as_csv(discordant, self._output_dir, pair_filename)

            return discordant

        # add SNPs shared with `individual3`
        merged = merged.join(individual3.snps["genotype"], rsuffix="3")

        genotype3 = "genotype_" + individual3.get_var_name()

        merged = merged.rename(columns={
            "genotype": genotype1,
            "genotype2": genotype2,
            "genotype3": genotype3,
        })

        child = merged[genotype1]
        parent_a = merged[genotype2]
        parent_b = merged[genotype3]

        # both parents carry the same homozygous genotype, but the child's
        # genotype is different
        parents_homozygous_same = (parent_a.notnull()
                                   & parent_b.notnull()
                                   & (parent_a.str.len() == 2)
                                   & (parent_a.str[0] == parent_a.str[1])
                                   & (parent_a == parent_b)
                                   & (child != parent_a))

        # find discordant SNPs between child and two parents
        trio_mask = (discordant_with(child, parent_a)
                     | discordant_with(child, parent_b)
                     | parents_homozygous_same)
        discordant = merged.loc[trio_mask]

        if save_output:
            trio_filename = "discordant_snps_{}_{}_{}_GRCh37.csv".format(
                individual1.get_var_name(),
                individual2.get_var_name(),
                individual3.get_var_name(),
            )
            save_df_as_csv(discordant, self._output_dir, trio_filename)

        return discordant
Beispiel #5
0
    def _add_snps(
        self,
        snps,
        discrepant_snp_positions_threshold,
        discrepant_genotypes_threshold,
        save_output,
    ):
        """ Add SNPs to this ``SNPsCollection``.

        Parameters
        ----------
        snps : SNPs
            SNPs to add
        discrepant_snp_positions_threshold : int
            see above
        discrepant_genotypes_threshold : int
            see above
        save_output
            see above

        Returns
        -------
        discrepant_positions : pandas.DataFrame
        discrepant_genotypes : pandas.DataFrame
        """
        discrepant_positions = pd.DataFrame()
        discrepant_genotypes = pd.DataFrame()

        if snps._snps is None:
            return discrepant_positions, discrepant_genotypes

        build = snps._build
        source = [s.strip() for s in snps._source.split(",")]

        if not snps._build_detected:
            print("build not detected, assuming build {}".format(snps._build))

        if self._build is None:
            self._build = build
        elif self._build != build:
            print(
                "build / assembly mismatch between current build of SNPs and SNPs being loaded"
            )

        # ensure there area always two X alleles
        snps = self._double_single_alleles(snps._snps, "X")

        if self._snps is None:
            self._source.extend(source)
            self._snps = snps
        else:
            common_snps = self._snps.join(snps, how="inner", rsuffix="_added")

            discrepant_positions = common_snps.loc[
                (common_snps["chrom"] != common_snps["chrom_added"])
                | (common_snps["pos"] != common_snps["pos_added"])]

            if 0 < len(
                    discrepant_positions) < discrepant_snp_positions_threshold:
                print(
                    str(len(discrepant_positions)) +
                    " SNP positions were discrepant; "
                    "keeping original positions")

                if save_output:
                    self._discrepant_positions_file_count += 1
                    save_df_as_csv(
                        discrepant_positions,
                        self._output_dir,
                        self.get_var_repr(self._name) +
                        "_discrepant_positions_" +
                        str(self._discrepant_positions_file_count) + ".csv",
                    )
            elif len(discrepant_positions
                     ) >= discrepant_snp_positions_threshold:
                print(
                    "too many SNPs differ in position; ensure same genome build is being used"
                )
                return discrepant_positions, discrepant_genotypes

            # remove null genotypes
            common_snps = common_snps.loc[
                ~common_snps["genotype"].isnull()
                & ~common_snps["genotype_added"].isnull()]

            # discrepant genotypes are where alleles are not equivalent (i.e., alleles are not the
            # same and not swapped)
            discrepant_genotypes = common_snps.loc[
                ((common_snps["genotype"].str.len() == 1)
                 & (common_snps["genotype_added"].str.len() == 1)
                 & ~(common_snps["genotype"].str[0] ==
                     common_snps["genotype_added"].str[0]))
                | ((common_snps["genotype"].str.len() == 2)
                   & (common_snps["genotype_added"].str.len() == 2)
                   & ~((common_snps["genotype"].str[0] ==
                        common_snps["genotype_added"].str[0])
                       & (common_snps["genotype"].str[1] ==
                          common_snps["genotype_added"].str[1]))
                   & ~((common_snps["genotype"].str[0] ==
                        common_snps["genotype_added"].str[1])
                       & (common_snps["genotype"].str[1] ==
                          common_snps["genotype_added"].str[0])))]

            if 0 < len(discrepant_genotypes) < discrepant_genotypes_threshold:
                print(
                    str(len(discrepant_genotypes)) +
                    " SNP genotypes were discrepant; "
                    "marking those as null")

                if save_output:
                    self._discrepant_genotypes_file_count += 1
                    save_df_as_csv(
                        discrepant_genotypes,
                        self._output_dir,
                        self.get_var_repr(self._name) +
                        "_discrepant_genotypes_" +
                        str(self._discrepant_genotypes_file_count) + ".csv",
                    )
            elif len(discrepant_genotypes) >= discrepant_genotypes_threshold:
                print(
                    "too many SNPs differ in their genotype; ensure file is for same "
                    "individual")
                return discrepant_positions, discrepant_genotypes

            # add new SNPs
            self._source.extend(source)
            self._snps = self._snps.combine_first(snps)
            self._snps.loc[discrepant_genotypes.index, "genotype"] = np.nan

            # combine_first converts position to float64, so convert it back to int64
            self._snps["pos"] = self._snps["pos"].astype(np.int64)

        self.sort_snps()

        return discrepant_positions, discrepant_genotypes