def save_snps(self, filename=None):
    """ Save SNPs to file.

    The CSV is preceded by a comment header listing the source(s), assembly,
    SNP count and chromosome summary of this object.

    Parameters
    ----------
    filename : str, optional
        filename for file to save; when ``None``, the name is built from
        ``self.get_var_repr(self._source)``, ``"_lineage_"``, the assembly
        and the ``".csv"`` extension

    Returns
    -------
    str
        path to file in output directory if SNPs were saved, else empty str
    """
    # One entry per header line; joined below into a single comment string.
    header_lines = (
        "# Source(s): {}\n".format(self.source),
        "# Assembly: {}\n".format(self.assembly),
        "# SNPs: {}\n".format(self.snp_count),
        "# Chromosomes: {}\n".format(self.chromosomes_summary),
    )
    comment = "".join(header_lines)

    if filename is None:
        # join (rather than format) keeps plain string-concatenation semantics
        filename = "".join(
            (self.get_var_repr(self._source), "_lineage_", self.assembly, ".csv")
        )

    return save_df_as_csv(
        self._snps,
        self._output_dir,
        filename,
        comment=comment,
        header=["chromosome", "position", "genotype"],
    )
def _save_discrepant_snps_file(self, df, name, filename):
    """ Save a table of discrepant SNPs as CSV in the output directory.

    Parameters
    ----------
    df : pandas.DataFrame
        table handed to ``save_df_as_csv``
    name : str
        label inserted into the default filename
    filename : str or None
        filename for file to save; when ``None``, the name is built from
        ``self.get_var_repr(self._name)``, ``"_"``, `name` and ``".csv"``

    Returns
    -------
    object
        whatever ``save_df_as_csv`` returns
    """
    target = filename
    if target is None:
        target = "".join((self.get_var_repr(self._name), "_", name, ".csv"))

    # The file starts with a single comment line naming the source(s).
    return save_df_as_csv(
        df,
        self._output_dir,
        target,
        comment="# Source(s): {}\n".format(self.source),
    )
def _find_shared_dna_output_helper(
    self,
    individual1,
    individual2,
    one_chrom_shared_dna,
    two_chrom_shared_dna,
    one_chrom_shared_genes,
    two_chrom_shared_genes,
):
    """ Write the shared DNA plot and the shared DNA / shared genes CSV files.

    Parameters
    ----------
    individual1 : Individual
    individual2 : Individual
    one_chrom_shared_dna : pandas.DataFrame
        plotted, and saved with two-decimal floats when non-empty
    two_chrom_shared_dna : pandas.DataFrame
        plotted, and saved with two-decimal floats when non-empty
    one_chrom_shared_genes : pandas.DataFrame
        saved when non-empty
    two_chrom_shared_genes : pandas.DataFrame
        saved when non-empty
    """
    cytobands = self._resources.get_cytoBand_hg19()

    # The plot is only attempted when the output directory is available.
    if create_dir(self._output_dir):
        plot_path = os.path.join(
            self._output_dir,
            "shared_dna_{}_{}.png".format(
                individual1.get_var_name(), individual2.get_var_name()
            ),
        )
        plot_title = "{} / {} shared DNA".format(individual1.name, individual2.name)
        plot_chromosomes(
            one_chrom_shared_dna,
            two_chrom_shared_dna,
            cytobands,
            plot_path,
            plot_title,
            37,
        )

    # NOTE(review): the CSV writes are treated as independent of the
    # ``create_dir`` check above - confirm against the original layout.
    # (table, filename template, extra keyword arguments for save_df_as_csv)
    outputs = (
        (
            one_chrom_shared_dna,
            "shared_dna_one_chrom_{}_{}_GRCh37.csv",
            {"float_format": "%.2f"},
        ),
        (
            two_chrom_shared_dna,
            "shared_dna_two_chroms_{}_{}_GRCh37.csv",
            {"float_format": "%.2f"},
        ),
        (one_chrom_shared_genes, "shared_genes_one_chrom_{}_{}_GRCh37.csv", {}),
        (two_chrom_shared_genes, "shared_genes_two_chroms_{}_{}_GRCh37.csv", {}),
    )

    for table, template, csv_options in outputs:
        # empty tables produce no file
        if len(table) > 0:
            csv_name = template.format(
                individual1.get_var_name(), individual2.get_var_name()
            )
            save_df_as_csv(table, self._output_dir, csv_name, **csv_options)
def find_discordant_snps(self, individual1, individual2, individual3=None, save_output=False):
    """ Find discordant SNPs between two or three individuals.

    A SNP is kept when the reference genotype shares no allele with a
    comparison genotype (single-allele genotypes must simply differ), or,
    for three individuals, when both comparison individuals carry the same
    homozygous genotype and the reference genotype is different from it.

    Parameters
    ----------
    individual1 : Individual
        reference individual (child if `individual2` and `individual3` are parents)
    individual2 : Individual
        comparison individual
    individual3 : Individual
        other parent if `individual1` is child and `individual2` is a parent
    save_output : bool
        specifies whether to save output to a CSV file in the output directory

    Returns
    -------
    pandas.DataFrame
        discordant SNPs and associated genetic data

    References
    ----------
    .. [1] David Pike, "Search for Discordant SNPs in Parent-Child
       Raw Data Files," David Pike's Utilities,
       http://www.math.mun.ca/~dapike/FF23utils/pair-discord.php
    .. [2] David Pike, "Search for Discordant SNPs when given data
       for child and both parents," David Pike's Utilities,
       http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
    """

    def shares_no_allele(frame, ref_col, other_col):
        # Rows where the reference genotype is incompatible with `other_col`.
        ref_len = frame[ref_col].str.len()
        other_len = frame[other_col].str.len()

        single = (
            (ref_len == 1)
            & (other_len == 1)
            & (frame[ref_col] != frame[other_col])
        )

        ref_a, ref_b = frame[ref_col].str[0], frame[ref_col].str[1]
        other_a, other_b = frame[other_col].str[0], frame[other_col].str[1]
        double = (
            (ref_len == 2)
            & (other_len == 2)
            & (ref_a != other_a)
            & (ref_a != other_b)
            & (ref_b != other_a)
            & (ref_b != other_b)
        )

        # Grouping is (notnull & single) | double; the length test in
        # `double` is already False for a null comparison genotype.
        return (frame[other_col].notnull() & single) | double

    self._remap_snps_to_GRCh37([individual1, individual2, individual3])

    reference = individual1.snps

    # only SNPs genotyped in the reference individual are considered
    merged = reference.loc[reference["genotype"].notnull()]

    # bring in the genotype of `individual2` (arrives as "genotype2")
    merged = merged.join(individual2.snps["genotype"], rsuffix="2")

    genotype1 = "genotype_" + individual1.get_var_name()
    genotype2 = "genotype_" + individual2.get_var_name()

    if individual3 is None:
        merged = merged.rename(
            columns={"genotype": genotype1, "genotype2": genotype2}
        )

        # reference versus the single comparison individual
        merged = merged.loc[shares_no_allele(merged, genotype1, genotype2)]

        if save_output:
            save_df_as_csv(
                merged,
                self._output_dir,
                "discordant_snps_{}_{}_GRCh37.csv".format(
                    individual1.get_var_name(), individual2.get_var_name()
                ),
            )
        return merged

    # bring in the genotype of `individual3` (arrives as "genotype3")
    merged = merged.join(individual3.snps["genotype"], rsuffix="3")
    genotype3 = "genotype_" + individual3.get_var_name()

    merged = merged.rename(
        columns={
            "genotype": genotype1,
            "genotype2": genotype2,
            "genotype3": genotype3,
        }
    )

    # both parents share one homozygous genotype that the child does not have
    parents_agree_child_differs = (
        merged[genotype2].notnull()
        & merged[genotype3].notnull()
        & (merged[genotype2].str.len() == 2)
        & (merged[genotype2].str[0] == merged[genotype2].str[1])
        & (merged[genotype2] == merged[genotype3])
        & (merged[genotype1] != merged[genotype2])
    )

    # child versus each parent, plus the combined-parents rule
    merged = merged.loc[
        shares_no_allele(merged, genotype1, genotype2)
        | shares_no_allele(merged, genotype1, genotype3)
        | parents_agree_child_differs
    ]

    if save_output:
        save_df_as_csv(
            merged,
            self._output_dir,
            "discordant_snps_{}_{}_{}_GRCh37.csv".format(
                individual1.get_var_name(),
                individual2.get_var_name(),
                individual3.get_var_name(),
            ),
        )

    return merged
def _add_snps(
    self,
    snps,
    discrepant_snp_positions_threshold,
    discrepant_genotypes_threshold,
    save_output,
):
    """ Add SNPs to this ``SNPsCollection``.

    The first set of SNPs is adopted as is. Later sets are merged in: SNPs
    whose chromosome or position disagree keep their original position, and
    SNPs whose genotypes disagree are set to null. If either kind of
    discrepancy reaches its threshold, nothing is added.

    Parameters
    ----------
    snps : SNPs
        SNPs to add
    discrepant_snp_positions_threshold : int
        number of SNPs with discrepant chromosome / position at which the
        merge is abandoned
    discrepant_genotypes_threshold : int
        number of SNPs with discrepant genotypes at which the merge is
        abandoned
    save_output : bool
        specifies whether discrepant SNPs are written to CSV files in the
        output directory

    Returns
    -------
    discrepant_positions : pandas.DataFrame
    discrepant_genotypes : pandas.DataFrame
    """
    discrepant_positions = pd.DataFrame()
    discrepant_genotypes = pd.DataFrame()

    # nothing to add
    if snps._snps is None:
        return discrepant_positions, discrepant_genotypes

    build = snps._build
    source = [label.strip() for label in snps._source.split(",")]

    if not snps._build_detected:
        print("build not detected, assuming build {}".format(snps._build))

    if self._build is None:
        self._build = build
    elif self._build != build:
        print(
            "build / assembly mismatch between current build of SNPs and SNPs being loaded"
        )

    # ensure there are always two X alleles
    incoming = self._double_single_alleles(snps._snps, "X")

    if self._snps is None:
        # first load: adopt the incoming SNPs unchanged
        self._source.extend(source)
        self._snps = incoming
        # NOTE(review): sorting is assumed to apply to both the first-load
        # and the merge path - confirm against the original layout.
        self.sort_snps()
        return discrepant_positions, discrepant_genotypes

    # SNPs present in both sets; incoming columns carry the "_added" suffix
    common_snps = self._snps.join(incoming, how="inner", rsuffix="_added")

    position_differs = (common_snps["chrom"] != common_snps["chrom_added"]) | (
        common_snps["pos"] != common_snps["pos_added"]
    )
    discrepant_positions = common_snps.loc[position_differs]
    position_count = len(discrepant_positions)

    if 0 < position_count < discrepant_snp_positions_threshold:
        print(
            str(position_count)
            + " SNP positions were discrepant; keeping original positions"
        )

        if save_output:
            self._discrepant_positions_file_count += 1
            save_df_as_csv(
                discrepant_positions,
                self._output_dir,
                "".join(
                    (
                        self.get_var_repr(self._name),
                        "_discrepant_positions_",
                        str(self._discrepant_positions_file_count),
                        ".csv",
                    )
                ),
            )
    elif position_count >= discrepant_snp_positions_threshold:
        print(
            "too many SNPs differ in position; ensure same genome build is being used"
        )
        return discrepant_positions, discrepant_genotypes

    # genotypes can only be compared where both sets have one
    common_snps = common_snps.loc[
        common_snps["genotype"].notnull() & common_snps["genotype_added"].notnull()
    ]

    current = common_snps["genotype"]
    added = common_snps["genotype_added"]

    # single-allele genotypes are discrepant when the allele differs
    single_differs = (
        (current.str.len() == 1)
        & (added.str.len() == 1)
        & ~(current.str[0] == added.str[0])
    )

    # two-allele genotypes are discrepant when the alleles are neither the
    # same nor swapped
    same_order = (current.str[0] == added.str[0]) & (
        current.str[1] == added.str[1]
    )
    swapped_order = (current.str[0] == added.str[1]) & (
        current.str[1] == added.str[0]
    )
    double_differs = (
        (current.str.len() == 2)
        & (added.str.len() == 2)
        & ~same_order
        & ~swapped_order
    )

    discrepant_genotypes = common_snps.loc[single_differs | double_differs]
    genotype_count = len(discrepant_genotypes)

    if 0 < genotype_count < discrepant_genotypes_threshold:
        print(
            str(genotype_count)
            + " SNP genotypes were discrepant; marking those as null"
        )

        if save_output:
            self._discrepant_genotypes_file_count += 1
            save_df_as_csv(
                discrepant_genotypes,
                self._output_dir,
                "".join(
                    (
                        self.get_var_repr(self._name),
                        "_discrepant_genotypes_",
                        str(self._discrepant_genotypes_file_count),
                        ".csv",
                    )
                ),
            )
    elif genotype_count >= discrepant_genotypes_threshold:
        print(
            "too many SNPs differ in their genotype; ensure file is for same "
            "individual"
        )
        return discrepant_positions, discrepant_genotypes

    # add new SNPs; existing values win wherever both sets have one
    self._source.extend(source)
    self._snps = self._snps.combine_first(incoming)
    self._snps.loc[discrepant_genotypes.index, "genotype"] = np.nan

    # combine_first converts position to float64, so convert it back to int64
    self._snps["pos"] = self._snps["pos"].astype(np.int64)

    self.sort_snps()

    return discrepant_positions, discrepant_genotypes