def _write_csv(self):
    """Save the SNPs as a delimited text file.

    When no filename was supplied, one is built from the cleaned source name
    and the assembly; the extension is ``.csv`` if the ``sep`` keyword
    argument is a comma and ``.txt`` otherwise. A commented preamble listing
    source, build, phasing, SNP count and chromosome summary is passed along
    with the data to `save_df_as_csv`.

    Returns
    -------
    str
        path to file in output directory if SNPs were saved, else empty str
    """
    data = self._snps
    options = self._kwargs

    filename = self._filename
    if not filename:
        ext = ".csv" if options.get("sep") == "," else ".txt"
        filename = f"{clean_str(data.source)}_{data.assembly}{ext}"

    comment = (
        f"# Source(s): {data.source}\n"
        f"# Build: {data.build}\n"
        f"# Build Detected: {data.build_detected}\n"
        f"# Phased: {data.phased}\n"
        f"# SNPs: {data.count}\n"
        f"# Chromosomes: {data.chromosomes_summary}\n"
    )

    # A missing `header` option or a literal ``True`` selects the default
    # column labels; ``False`` or any caller-provided value is left untouched.
    # The stored keyword arguments are updated in place.
    if options.get("header", True) is True:
        options["header"] = ["chromosome", "position", "genotype"]

    return save_df_as_csv(
        data._snps,
        data._output_dir,
        filename,
        comment=comment,
        atomic=self._atomic,
        **options,
    )
def _write_vcf(self):
    """Write SNPs to a VCF file.

    Rows whose genotype contains an insertion (``I``) or deletion (``D``)
    allele are skipped, as are chromosomes other than 1-22, X, Y and MT.
    Each remaining chromosome is converted by
    ``self._create_vcf_representation`` and the pieces are concatenated into
    a single tab-separated file preceded by the VCF header lines.

    References
    ----------
    1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
       https://samtools.github.io/hts-specs/VCFv4.2.pdf

    Returns
    -------
    str
        path to file in output directory if SNPs were saved, else empty str
    discrepant_vcf_position : pd.DataFrame
        SNPs with discrepant positions discovered while saving VCF
    """
    filename = self._filename
    if not filename:
        filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}.vcf"

    # `datetime.utcnow()` is deprecated since Python 3.12; an aware UTC
    # timestamp formats to exactly the same date string.
    file_date = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d")

    comment = (
        "##fileformat=VCFv4.2\n"
        f"##fileDate={file_date}\n"
        f'##source="{self._snps.source}; snps v{snps.__version__}; https://pypi.org/project/snps/"\n'
    )

    # chromosomes for which a VCF representation is created
    reference_sequence_chroms = tuple(str(n) for n in range(1, 23)) + (
        "X",
        "Y",
        "MT",
    )

    df = self._snps.snps

    # skip insertions and deletions
    genotype = df["genotype"]
    first_allele = genotype.str[0]
    second_allele = genotype.str[1]
    is_indel = genotype.notnull() & (
        (first_allele == "I")
        | (first_allele == "D")
        | (second_allele == "I")
        | (second_allele == "D")
    )
    df = df.drop(df.loc[is_indel].index)

    # One task per chromosome; chromosomes without reference sequence data
    # (e.g., unassigned PAR) simply get no task, so they never reach the
    # output and need no separate removal from `df`.
    tasks = [
        {
            "resources": self._snps._resources,
            "assembly": self._snps.assembly,
            "chrom": chrom,
            "snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]),
        }
        for chrom in df["chrom"].unique()
        if chrom in reference_sequence_chroms
    ]

    # create the VCF representation for SNPs
    contigs = []
    # seeded with an empty frame so `pd.concat` also works when there are no tasks
    vcf_frames = [pd.DataFrame()]
    discrepant_frames = [pd.DataFrame()]

    for result in map(self._create_vcf_representation, tasks):
        contigs.append(result["contig"])
        vcf_frames.append(result["vcf"])
        discrepant_frames.append(result["discrepant_vcf_position"])

    vcf = pd.concat(vcf_frames)
    discrepant_vcf_position = pd.concat(discrepant_frames)

    comment += "".join(contigs)
    comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"

    return (
        save_df_as_csv(
            vcf,
            self._snps._output_dir,
            filename,
            comment=comment,
            prepend_info=False,
            header=False,
            index=False,
            na_rep=".",
            sep="\t",
        ),
        discrepant_vcf_position,
    )
def _find_shared_dna_output_helper(
    self,
    individuals,
    one_chrom_shared_dna,
    two_chrom_shared_dna,
    one_chrom_shared_genes,
    two_chrom_shared_genes,
):
    """Plot shared DNA and save the non-empty shared DNA / gene tables.

    File names are derived from the individuals' variable names joined by
    ``_``; the plot title uses their display names joined by `` / ``. The
    plot is only drawn when the output directory could be created, while
    each table is handed to `save_df_as_csv` whenever it has rows.

    Parameters
    ----------
    individuals : iterable
        objects providing ``get_var_name()`` and ``name``
    one_chrom_shared_dna, two_chrom_shared_dna : sized table
        shared DNA tables; saved with two-decimal float formatting
    one_chrom_shared_genes, two_chrom_shared_genes : sized table
        shared gene tables
    """
    cytobands = self._resources.get_cytoBand_hg19()

    var_names = []
    display_names = []
    for person in individuals:
        var_names.append(person.get_var_name())
        display_names.append(person.name)

    individuals_filename = "_".join(var_names)
    individuals_plot_title = " / ".join(display_names)

    if create_dir(self._output_dir):
        plot_path = os.path.join(
            self._output_dir, "shared_dna_{}.png".format(individuals_filename)
        )
        plot_chromosomes(
            one_chrom_shared_dna,
            two_chrom_shared_dna,
            cytobands,
            plot_path,
            "{} shared DNA".format(individuals_plot_title),
            37,
        )

    # (table, file name template, extra keyword arguments for the writer)
    outputs = (
        (
            one_chrom_shared_dna,
            "shared_dna_one_chrom_{}_GRCh37.csv",
            {"float_format": "%.2f"},
        ),
        (
            two_chrom_shared_dna,
            "shared_dna_two_chroms_{}_GRCh37.csv",
            {"float_format": "%.2f"},
        ),
        (one_chrom_shared_genes, "shared_genes_one_chrom_{}_GRCh37.csv", {}),
        (two_chrom_shared_genes, "shared_genes_two_chroms_{}_GRCh37.csv", {}),
    )

    for table, template, extra in outputs:
        if len(table) > 0:
            save_df_as_csv(
                table,
                self._output_dir,
                template.format(individuals_filename),
                comment=self._get_csv_header(),
                prepend_info=False,
                **extra,
            )
def find_discordant_snps(
    self, individual1, individual2, individual3=None, save_output=False
):
    """Find discordant SNPs between two or three individuals.

    Parameters
    ----------
    individual1 : Individual
        reference individual (child if `individual2` and `individual3` are parents)
    individual2 : Individual
        comparison individual
    individual3 : Individual
        other parent if `individual1` is child and `individual2` is a parent
    save_output : bool
        specifies whether to save output to a CSV file in the output directory

    Returns
    -------
    pandas.DataFrame
        discordant SNPs and associated genetic data

    References
    ----------
    1. David Pike, "Search for Discordant SNPs in Parent-Child Raw Data Files,"
       David Pike's Utilities,
       http://www.math.mun.ca/~dapike/FF23utils/pair-discord.php
    2. David Pike, "Search for Discordant SNPs when given data for child and both parents,"
       David Pike's Utilities,
       http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
    """

    def discordant_with(frame, ref_col, other_col):
        """Return a boolean mask of rows where the two genotype columns disagree.

        Single-allele genotypes disagree when they differ; two-allele
        genotypes disagree when no allele of one appears in the other.
        """
        ref = frame[ref_col]
        other = frame[other_col]

        single_allele = (
            (ref.str.len() == 1) & (other.str.len() == 1) & (ref != other)
        )
        # a null genotype has no length of 2, so it never satisfies this test
        two_alleles = (
            (ref.str.len() == 2)
            & (other.str.len() == 2)
            & (ref.str[0] != other.str[0])
            & (ref.str[0] != other.str[1])
            & (ref.str[1] != other.str[0])
            & (ref.str[1] != other.str[1])
        )
        # grouping spelled out; identical to how `&` / `|` precedence
        # evaluated the original inline expression
        return (other.notnull() & single_allele) | two_alleles

    self._remap_snps_to_GRCh37([individual1, individual2, individual3])

    df = individual1.snps

    # remove nulls for reference individual
    df = df.loc[df["genotype"].notnull()]

    # add SNPs shared with `individual2`
    df = df.join(individual2.snps["genotype"], rsuffix="2")

    genotype1 = "genotype_" + individual1.get_var_name()
    genotype2 = "genotype_" + individual2.get_var_name()

    if individual3 is None:
        participants = (individual1, individual2)
        df = df.rename(columns={"genotype": genotype1, "genotype2": genotype2})

        # find discordant SNPs between reference and comparison individuals
        df = df.loc[discordant_with(df, genotype1, genotype2)]
    else:
        participants = (individual1, individual2, individual3)

        # add SNPs shared with `individual3`
        df = df.join(individual3.snps["genotype"], rsuffix="3")
        genotype3 = "genotype_" + individual3.get_var_name()

        df = df.rename(
            columns={
                "genotype": genotype1,
                "genotype2": genotype2,
                "genotype3": genotype3,
            }
        )

        # both parents carry the same homozygous genotype but the child's
        # genotype differs from it
        parents_same_homozygous = (
            df[genotype2].notnull()
            & df[genotype3].notnull()
            & (df[genotype2].str.len() == 2)
            & (df[genotype2].str[0] == df[genotype2].str[1])
            & (df[genotype2] == df[genotype3])
            & (df[genotype1] != df[genotype2])
        )

        # find discordant SNPs between child and two parents
        df = df.loc[
            discordant_with(df, genotype1, genotype2)
            | discordant_with(df, genotype1, genotype3)
            | parents_same_homozygous
        ]

    if save_output:
        var_names = "_".join(person.get_var_name() for person in participants)
        save_df_as_csv(
            df,
            self._output_dir,
            f"discordant_snps_{var_names}_GRCh37.csv",
            comment=self._get_csv_header(),
            prepend_info=False,
        )

    return df
def main():
    """Parse every file in the openSNP datadump and summarize the results.

    Files are loaded via `load_file` through a `Parallelizer`; results
    without a ``msg`` key are written to ``parse-opensnp-files.csv`` and
    parsing statistics are logged. When `EXTRACT_FILES` is set, files whose
    result carried a message (plus files where the build was not detected)
    are copied into one directory per message for debugging.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = [
        filename
        for filename in r.get_opensnp_datadump_filenames()
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; the ratios are guarded so that an empty datadump
    # or a run where nothing parsed does not raise ZeroDivisionError before
    # the files can be extracted for debugging below
    file_count = len(filenames)
    parsed_count = len(df)
    parsed_ratio = parsed_count / file_count if file_count else 0.0
    build_detected_ratio = (
        len(df.loc[df.build_detected]) / parsed_count if parsed_count else 0.0
    )

    logger.info(f"{file_count} files in the openSNP datadump")
    logger.info(f"{parsed_ratio:.2%} of openSNP datadump files parsed")
    logger.info(f"build detected in {build_detected_ratio:.2%} of files parsed")

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)

            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
"build_detected", "x_snps", "heterozygous_x_snps", "y_snps", "y_snps_not_null", "count", ], ) # derive the columns we want to analyze df["heterozygous_x_snps_ratio"] = df.heterozygous_x_snps / df.x_snps df["y_snps_not_null_ratio"] = df.y_snps_not_null / df.y_snps df.drop(df.loc[df["heterozygous_x_snps_ratio"].isna()].index, inplace=True) df.drop(df.loc[df["y_snps_not_null_ratio"].isna()].index, inplace=True) plt = create_analysis_plot( df[["heterozygous_x_snps_ratio", "y_snps_not_null_ratio"]]) # save output with atomic_write( f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.png')}", mode="wb", overwrite=True, ) as f: plt.savefig(f) save_df_as_csv(df, OUTPUT_DIR, "xy-chrom-snp-ratios.csv") logger.info("stop")
def main():
    """Run the ancestry analysis over the openSNP datadump.

    The 1000 Genomes samples and the selected AISNP VCF are loaded, the
    genotypes are encoded and reduced, and two k-nearest-neighbor classifiers
    are fit on the reduced data (one on the ``super population`` labels, one
    on the ``population`` labels). Every datadump file is then processed by
    `process_file` through a `Parallelizer`, and the non-empty results are
    saved to ``opensnp_ancestry.csv``.
    """
    logging.info("start analysis")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # get the 1000 genomes samples
    dfsamples = get_1kg_samples(
        f"{DATA_DIR}/integrated_call_samples_v3.20130502.ALL.panel"
    )
    logging.info("retreived the 1kg samples")

    # choose the AISNP panel, then load it once
    if aisnp_SET == "kidd et al. 55 aisnps":
        aisnps_vcf = f"{DATA_DIR}/kidd.55aisnp.1kg.vcf"
    else:
        aisnps_vcf = f"{DATA_DIR}/Seldin.128aisnp.1kg.vcf"
    aisnps_1kg = vcf2df(aisnps_vcf, dfsamples)
    logging.info("made the AIsnp DataFrame")

    # Encode 1kg data
    X_encoded, encoder = encode_genotypes(aisnps_1kg)
    logging.info("encoded the genotypes")

    # perform dimensionality reduction on the 1kg set
    X_reduced, reducer = dimensionality_reduction(
        X_encoded, algorithm=DIMENSIONALITY_REDUCTION_ALGORITHM
    )
    logging.info("Reduced the dimensionality of the genotypes")

    # predicted population; both classifiers share the same settings
    knn_settings = {"n_neighbors": 9, "weights": "distance", "n_jobs": 1}
    knn_super_pop = KNeighborsClassifier(**knn_settings)
    knn_pop = KNeighborsClassifier(**knn_settings)

    # fit the knn before adding the user sample
    logging.info("Fitting the superpopulation model")
    knn_super_pop.fit(X_reduced, dfsamples["super population"])
    logging.info("Done!")

    logging.info("Fitting the population model")
    knn_pop.fit(X_reduced, dfsamples["population"])
    logging.info("Done!")

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    shared_inputs = {
        "aisnps_1kg": aisnps_1kg,
        "X_encoded": X_encoded,
        "encoder": encoder,
        "reducer": reducer,
        "knn_super_pop": knn_super_pop,
        "knn_pop": knn_pop,
    }
    tasks = [{"file": filenames[i], **shared_inputs} for i in samples]

    # run tasks; results is a list of dicts
    results = p(process_file, tasks)

    # get rows for dataframe summarizing results
    rows = list(filter(None, results))

    file_columns = [
        "file",
        "source",
        "build",
        "build_detected",
        "chromosomes_summary",
        "snp_count",
    ]
    # presumably the 1000 Genomes super population codes - confirm against `process_file`
    super_population_columns = ["AFR", "AMR", "EAS", "EUR", "SAS"]
    # presumably the 1000 Genomes population codes - confirm against `process_file`
    population_columns = [
        "ACB",
        "ASW",
        "BEB",
        "CDX",
        "CEU",
        "CHB",
        "CHS",
        "CLM",
        "ESN",
        "FIN",
        "GBR",
        "GIH",
        "GWD",
        "IBS",
        "ITU",
        "JPT",
        "KHV",
        "LWK",
        "MSL",
        "MXL",
        "PEL",
        "PJL",
        "PUR",
        "STU",
        "TSI",
        "YRI",
    ]
    component_columns = ["component1", "component2", "component3"]

    df = pd.DataFrame(
        rows,
        columns=(
            file_columns
            + super_population_columns
            + population_columns
            + component_columns
        ),
    )

    save_df_as_csv(df, OUTPUT_DIR, "opensnp_ancestry.csv")
    logging.info("analysis done!")