def _write_csv(self):
    """ Write SNPs to a CSV file.

    Returns
    -------
    str
        path to file in output directory if SNPs were saved, else empty str
    """
    filename = self._filename
    if not filename:
        ext = ".txt"

        if "sep" in self._kwargs and self._kwargs["sep"] == ",":
            ext = ".csv"

        filename = "{}_{}{}".format(
            clean_str(self._snps.source), self._snps.assembly, ext
        )

    comment = (
        "# Source(s): {}\n"
        "# Build: {}\n"
        "# Build Detected: {}\n"
        "# Phased: {}\n"
        "# SNPs: {}\n"
        "# Chromosomes: {}\n".format(
            self._snps.source,
            self._snps.build,
            self._snps.build_detected,
            self._snps.phased,
            self._snps.count,
            self._snps.chromosomes_summary,
        )
    )

    # normalize the `header` kwarg: a bare `header=True` and a missing
    # `header` both become the default column names
    if "header" in self._kwargs:
        if isinstance(self._kwargs["header"], bool):
            if self._kwargs["header"]:
                self._kwargs["header"] = ["chromosome", "position", "genotype"]
    else:
        self._kwargs["header"] = ["chromosome", "position", "genotype"]

    return save_df_as_csv(
        self._snps._snps,
        self._snps._output_dir,
        filename,
        comment=comment,
        atomic=self._atomic,
        **self._kwargs
    )
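
# --- Illustrative sketch (not part of the library) --------------------------
# `_write_csv` delegates the actual write to `save_df_as_csv`, which prepends
# the "#"-prefixed comment block and writes atomically. The helper below is a
# minimal, hypothetical stand-in showing that pattern with pandas; the name
# `write_csv_with_comment` and the temp-file strategy are assumptions, not the
# library's implementation.
import os
import tempfile

import pandas as pd


def write_csv_with_comment(df, path, comment, **kwargs):
    """Write `df` to `path` with `comment` prepended, via an atomic rename."""
    tmp_fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
    try:
        with os.fdopen(tmp_fd, "w", newline="") as f:
            f.write(comment)  # e.g., "# Source(s): Example\n# Build: 37\n"
            df.to_csv(f, **kwargs)
        os.replace(tmp_path, path)  # atomic within one filesystem
    except BaseException:
        os.remove(tmp_path)
        raise
    return path


# Example usage with a toy SNPs-like DataFrame:
#
#   snps_df = pd.DataFrame(
#       {"chrom": ["1", "1"], "pos": [101, 202], "genotype": ["AA", "CT"]},
#       index=pd.Index(["rs1", "rs2"], name="rsid"),
#   )
#   write_csv_with_comment(
#       snps_df,
#       "example_GRCh37.csv",
#       "# Source(s): Example\n# Build: 37\n",
#       sep=",",
#       header=["chromosome", "position", "genotype"],
#   )
# -----------------------------------------------------------------------------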
def _write_vcf(self):
    """ Write SNPs to a VCF file.

    References
    ----------
    1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
       https://samtools.github.io/hts-specs/VCFv4.2.pdf

    Returns
    -------
    str
        path to file in output directory if SNPs were saved, else empty str
    discrepant_vcf_position : pd.DataFrame
        SNPs with discrepant positions discovered while saving VCF
    """
    filename = self._filename
    if not filename:
        filename = "{}_{}{}".format(
            clean_str(self._snps.source), self._snps.assembly, ".vcf"
        )

    comment = (
        "##fileformat=VCFv4.2\n"
        "##fileDate={}\n"
        '##source="{}; snps v{}; https://pypi.org/project/snps/"\n'.format(
            datetime.datetime.utcnow().strftime("%Y%m%d"),
            self._snps.source,
            snps.__version__,
        )
    )

    reference_sequence_chroms = (
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
        "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
        "X", "Y", "MT",
    )

    df = self._snps.snps

    tasks = []

    # skip insertions and deletions
    df = df.drop(
        df.loc[
            df["genotype"].notnull()
            & (
                (df["genotype"].str[0] == "I")
                | (df["genotype"].str[0] == "D")
                | (df["genotype"].str[1] == "I")
                | (df["genotype"].str[1] == "D")
            )
        ].index
    )

    chroms_to_drop = []
    for chrom in df["chrom"].unique():
        if chrom not in reference_sequence_chroms:
            chroms_to_drop.append(chrom)
            continue

        tasks.append(
            {
                "resources": self._snps._resources,
                "assembly": self._snps.assembly,
                "chrom": chrom,
                "snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]),
            }
        )

    # drop chromosomes without reference sequence data (e.g., unassigned PAR)
    for chrom in chroms_to_drop:
        df = df.drop(df.loc[df["chrom"] == chrom].index)

    # create the VCF representation for SNPs
    results = map(self._create_vcf_representation, tasks)

    contigs = []
    vcf = [pd.DataFrame()]
    discrepant_vcf_position = [pd.DataFrame()]
    for result in list(results):
        contigs.append(result["contig"])
        vcf.append(result["vcf"])
        discrepant_vcf_position.append(result["discrepant_vcf_position"])

    vcf = pd.concat(vcf)
    discrepant_vcf_position = pd.concat(discrepant_vcf_position)

    comment += "".join(contigs)
    comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"

    return (
        save_df_as_csv(
            vcf,
            self._snps._output_dir,
            filename,
            comment=comment,
            prepend_info=False,
            header=False,
            index=False,
            na_rep=".",
            sep="\t",
        ),
        discrepant_vcf_position,
    )
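
# --- Illustrative sketch (not part of the library) --------------------------
# The rows `_write_vcf` ultimately writes are tab-separated VCF v4.2 records
# with a single-sample GT field. In the real writer, REF comes from the
# reference sequence fetched per chromosome in `_create_vcf_representation`
# (not shown); the hypothetical helper below hardcodes REF to show how a
# diploid genotype maps onto REF/ALT and a GT index string.
def genotype_to_vcf_row(rsid, chrom, pos, ref, genotype):
    """Build one unphased VCF data row for a diploid genotype like "AG"."""
    alts = sorted({allele for allele in genotype if allele != ref})
    alt = ",".join(alts) if alts else "."
    allele_index = {ref: "0", **{a: str(i + 1) for i, a in enumerate(alts)}}
    gt = "/".join(allele_index[a] for a in genotype)  # "|" would mean phased
    return {
        "CHROM": chrom, "POS": pos, "ID": rsid, "REF": ref, "ALT": alt,
        "QUAL": ".", "FILTER": ".", "INFO": ".", "FORMAT": "GT", "SAMPLE": gt,
    }


# Example usage:
#
#   row = genotype_to_vcf_row("rs1", "1", 101, "A", "AG")  # SAMPLE -> "0/1"
#   print("\t".join(str(row[c]) for c in (
#       "CHROM", "POS", "ID", "REF", "ALT",
#       "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE",
#   )))
# -----------------------------------------------------------------------------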
def get_var_name(self):
    """ Get a cleaned version of this object's name for use as a variable name. """
    return clean_str(self.name)
def main():
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics
    file_count = len(filenames)
    logger.info("{} files in the openSNP datadump".format(file_count))
    logger.info(
        "{:.2%} of openSNP datadump files parsed".format(len(df) / file_count)
    )
    logger.info(
        "build detected in {:.2%} of files parsed".format(
            len(df.loc[df.build_detected]) / len(df)
        )
    )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                if result["msg"] in d:
                    d[result["msg"]].append(result["file"])
                else:
                    d[result["msg"]] = [result["file"]]

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if len(files) == 0:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(
                OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg))
            )
            create_dir(path)

            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
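
# --- Illustrative sketch (not part of this script's actual code) ------------
# `main` expects each parallel task to return a dict: parsing stats when SNPs
# were loaded, or a dict containing "msg" when the file should be skipped and
# extracted for debugging. The hypothetical worker below mirrors only that
# return-value contract; the real `load_file` is defined elsewhere in this
# script, and feeding the `SNPs` constructor the raw bytes returned by
# `r.load_opensnp_datadump_file` exactly this way is an assumption.
from snps import SNPs


def _load_file_sketch(task):
    file = task["file"]  # filename within the openSNP data dump
    try:
        # `r` is the module-level resources object `main` already uses
        s = SNPs(r.load_opensnp_datadump_file(file))
    except Exception as err:
        return {"file": file, "msg": str(err)}

    if s.count == 0:
        return {"file": file, "msg": "no SNPs processed"}

    return {
        "file": file,
        "source": s.source,
        "build": s.build,
        "build_detected": s.build_detected,
        "chromosomes": s.chromosomes_summary,
        "count": s.count,
    }
# -----------------------------------------------------------------------------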