def compress_vcf(path, data_path, file_name):
    """Compress the file at ``path`` into ``<data_path>/<file_name>.bgz``.

    The input is read in binary mode and streamed line by line through a
    ``bgzip.BGZipWriter`` so that the whole file never has to be held in
    memory at once.

    Args:
        path: Path of the file to compress.
        data_path: Directory in which the compressed file is created.
        file_name: Base name of the output; ``.bgz`` is appended.

    Returns:
        The path of the compressed file that was written.
    """
    data_bgz = "{}/{}.bgz".format(data_path, file_name)
    with open(path, 'rb') as data_stream, open(data_bgz, 'wb') as bgz_stream:
        with bgzip.BGZipWriter(bgz_stream) as fh:
            # Iterate the stream lazily instead of materialising every line
            # with readlines(); inputs of this kind can be very large.
            for line in data_stream:
                fh.write(line)
    return data_bgz
def write_body_in_gz_file(self):
    """
    Writes body in the compressed file, or on the stdout, regarding input
    arguments.

    When ``self.path`` is truthy, the ``line`` of every item in
    ``self.list_of_body_records_chrom`` is UTF-8 encoded and appended
    (mode ``"a+b"``) to that file through a ``bgzip.BGZipWriter``, which is
    also bound to ``self.file`` for the duration of the write.
    Otherwise the same lines are written to stdout as text.
    """
    if self.path:
        with open(self.path, "a+b") as raw:
            with bgzip.BGZipWriter(raw) as self.file:
                for list_item in self.list_of_body_records_chrom:
                    self.file.write(list_item.line.encode('utf-8'))
    else:
        for list_item in self.list_of_body_records_chrom:
            # Print the text itself: printing the encoded bytes would emit
            # a b'...' repr under Python 3. end='' mirrors the file branch,
            # which writes each line verbatim without adding a terminator.
            print(list_item.line, end='')
def created_vcf_tabix_files(vcf_content):
    """Write ``vcf_content`` to a bgzipped temporary file and tabix-index it.

    The resulting ``(vcf_path, index_path)`` tuple is also appended to the
    module-level ``created_files`` list.

    Args:
        vcf_content: Bytes written through ``bgzip.BGZipWriter``.

    Returns:
        A tuple ``(path, path + ".tbi")`` for the compressed file and the
        index that ``tabix`` is expected to create next to it.

    Raises:
        subprocess.CalledProcessError: If ``tabix`` exits with a non-zero
            status.
    """
    import os

    fd, tmp_input_path = tempfile.mkstemp(prefix='vw_test_file_', suffix='.vcf.gz')
    # mkstemp hands back an already-open descriptor; wrap it so it is closed
    # with the file object instead of being leaked.
    with os.fdopen(fd, 'wb') as raw_fd:
        with bgzip.BGZipWriter(raw_fd) as fh:
            fh.write(vcf_content)

    # Index only after both writers are closed so the file is complete on disk.
    subprocess.run(['tabix', '-p', 'vcf', tmp_input_path], check=True)

    files_path = (tmp_input_path, tmp_input_path + ".tbi")
    created_files.append(files_path)
    return files_path
def write_header_in_gz_file(self):
    """
    Writes header in the compressed file, or on the stdout, regarding input
    arguments.

    The header consists of ``self.version``, the ``line`` of every item in
    ``self.list_of_header_objects`` and finally ``self.body_header_line.line``.
    When ``self.path`` is truthy the file is (re)created in ``"w+b"`` mode
    and the header is written UTF-8 encoded through a ``bgzip.BGZipWriter``
    (bound to ``self.file`` during the write); otherwise the same lines are
    written to stdout as text.
    """
    if self.path:
        with open(self.path, "w+b") as raw:
            with bgzip.BGZipWriter(raw) as self.file:
                self.file.write(self.version.encode('utf-8'))
                for list_item in self.list_of_header_objects:
                    self.file.write(list_item.line.encode('utf-8'))
                self.file.write(self.body_header_line.line.encode('utf-8'))
    else:
        # Emit exactly what the file branch writes. Text is printed rather
        # than encoded bytes (which would show up as b'...' reprs), and
        # end='' is used because the lines are written verbatim to the file.
        print(self.version, end='')
        for list_item in self.list_of_header_objects:
            print(list_item.line, end='')
        print(self.body_header_line.line, end='')
def partialize_vcf(uri: str, number_of_lines: int, zip_format: str="bgzip") -> bytes:
    """Return the lines produced by ``_vcf_lines(uri, number_of_lines)``, compressed.

    Args:
        uri: Passed through to ``_vcf_lines``.
        number_of_lines: Passed through to ``_vcf_lines``.
        zip_format: ``"bgzip"`` to compress with ``bgzip.BGZipWriter`` or
            ``"gzip"`` to compress with ``gzip.GzipFile``.

    Returns:
        The compressed bytes accumulated in an in-memory buffer.

    Raises:
        ValueError: If ``zip_format`` is neither ``"bgzip"`` nor ``"gzip"``.
    """
    if zip_format not in ("bgzip", "gzip"):
        raise ValueError("Supported values for `zip_format` are 'bgzip' and 'gzip'")

    with io.BytesIO() as buffer:
        if zip_format == "bgzip":
            compressor = bgzip.BGZipWriter(buffer)
        else:
            compressor = gzip.GzipFile(fileobj=buffer, mode="w")

        with compressor as sink:
            for record in _vcf_lines(uri, number_of_lines):
                sink.write(record)

        # Read the buffer only after the compressor has been closed, so any
        # trailing data it emits on close is included.
        return buffer.getvalue()
def test_profile_write(self):
    """Profile a stdlib gzip write against bgzip writes for each thread count.

    The fixture ``tests/fixtures/partial.vcf.gz`` is inflated once; the
    resulting bytes are then written through ``gzip.GzipFile`` and through
    ``bgzip.BGZipWriter`` with 1..``bgzip.available_cores`` threads, each
    inside the ``profile`` context manager (defined elsewhere; presumably
    it reports timings).
    """
    print()
    with open("tests/fixtures/partial.vcf.gz", "rb") as fixture, \
            gzip.GzipFile(fileobj=fixture) as reader:
        payload = reader.read()

    with profile("gzip write"), \
            gzip.GzipFile(fileobj=io.BytesIO(), mode="w") as gz_out:
        gz_out.write(payload)

    # The bgzip payload is deliberately delivered in two writes split here.
    split_at = 987345
    for thread_count in range(1, bgzip.available_cores + 1):
        with profile(f"bgzip write (num_threads={thread_count})"), \
                bgzip.BGZipWriter(io.BytesIO(), num_threads=thread_count) as bgz_out:
            bgz_out.write(payload[:split_at])
            bgz_out.write(payload[split_at:])
    print()
def install_cache_manual_fasta(vep_version, assembly, cache_path,
                               homo_sapiens_path, index_path):
    """Download the reference FASTA for ``assembly`` and index it with samtools.

    The FASTA is fetched from the Ensembl FTP site into
    ``<homo_sapiens_path>/<vep_version>_<assembly>/``. If ``magic`` does not
    describe the download as a gzip file "with extra field" (how bgzipped
    files are reported), it is re-compressed with bgzip so that it can be
    indexed. ``samtools faidx`` is then run on it and, on success, an empty
    marker file is created at ``index_path``.

    Args:
        vep_version: Release number used in the destination directory name
            and, for assemblies other than GRCh37, in the download URL.
        assembly: ``"GRCh37"`` or ``"GRCh38"``.
        cache_path: Not used by this function; kept for interface
            compatibility.
        homo_sapiens_path: Parent directory of the destination directory.
        index_path: Path of the marker file created after successful indexing.

    Raises:
        KeyError: If ``assembly`` is not one of the supported names.
        ReferenceDownloadError: If ``samtools faidx`` exits with a non-zero
            status.
    """
    import os

    dest_dir = "{}/{}_{}".format(homo_sapiens_path, vep_version, assembly)
    fasta_names = {
        "GRCh37": "Homo_sapiens.GRCh37.75.dna."
                  "primary_assembly.fa.gz",
        "GRCh38": "Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
    }
    fasta_name = fasta_names[assembly]
    fasta_type = {"GRCh37": "dna", "GRCh38": "dna_index"}
    fasta_path = "{}/{}".format(dest_dir, fasta_name)

    download_file(
        "ftp://ftp.ensembl.org/pub/release-{vep_ver}"
        "/fasta/homo_sapiens/{type}/"
        "{fasta_name}".format(
            # GRCh37 is always taken from release 75.
            vep_ver=75 if assembly == 'GRCh37' else vep_version,
            type=fasta_type[assembly],
            fasta_name=fasta_name),
        fasta_path,
        1800)  # NOTE(review): presumably a timeout in seconds — confirm.

    file_info = magic.from_file(fasta_path)
    # convert gzip to bgzip if necessary for indexing
    # bgzipped files are described as "gzip file with extra field"
    if "extra field" not in file_info:
        # Stream through a sibling temporary file rather than reading the
        # whole genome into memory, then swap it into place.
        tmp_path = fasta_path + ".tmp"
        try:
            with gzip.open(fasta_path, 'rb') as gzip_stream, \
                    open(tmp_path, 'wb') as bgzip_stream:
                with bgzip.BGZipWriter(bgzip_stream) as fh:
                    for line in gzip_stream:
                        fh.write(line)
            os.replace(tmp_path, fasta_path)
        finally:
            # Only still present if something failed before the replace.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # Pass argv as a list so paths containing whitespace are not split.
    results = subprocess.run(["samtools", "faidx", fasta_path])
    if results.returncode != 0:
        raise ReferenceDownloadError("Error creating index. Please try again.")
    # Create (or truncate) the empty marker file signalling a finished index.
    with open(index_path, 'w'):
        pass
def test_pathalogical_write(self):
    """Open a BGZipWriter over an in-memory buffer while writing empty bytes.

    The test has no assertions; it passes as long as entering and leaving
    the writer's context raises nothing.
    """
    buffer = io.BytesIO()
    # NOTE(review): the empty write targets the underlying BytesIO, not the
    # BGZipWriter itself — confirm whether the writer was the intended target.
    with bgzip.BGZipWriter(buffer):
        buffer.write(b"")