Exemple #1
0
def compress_vcf(path, data_path, file_name):
    """Compress the file at ``path`` into ``<data_path>/<file_name>.bgz``.

    The input is opened in binary mode and fed, line by line, through a
    ``bgzip.BGZipWriter`` wrapped around the output file.

    Args:
        path: Path of the file to compress.
        data_path: Directory in which the compressed file is created.
        file_name: Base name of the output file; ``.bgz`` is appended.

    Returns:
        The path of the compressed file, as a string.
    """
    data_bgz = "{}/{}.bgz".format(data_path, file_name)
    # The input is opened first, so a missing input never creates an
    # (empty) output file.
    with open(path, 'rb') as data_stream, open(data_bgz, 'wb') as bgz_stream:
        with bgzip.BGZipWriter(bgz_stream) as fh:
            # Hand the file object itself to writelines(): the lines are
            # produced lazily instead of being materialised all at once by
            # readlines(), so memory use no longer grows with file size.
            fh.writelines(data_stream)
    return data_bgz
 def write_body_in_gz_file(self):
     """ Writes body in the compressed file, or on the stdout, regarding input arguments.

     When ``self.path`` is truthy, the file is opened in append mode
     (``"a+b"``) and the ``line`` of every record in
     ``self.list_of_body_records_chrom`` is written, UTF-8 encoded, through
     a ``bgzip.BGZipWriter`` (bound to ``self.file``). Otherwise the record
     lines are written to stdout as text.
     """
     if self.path:
         with open(self.path, "a+b") as raw:
             with bgzip.BGZipWriter(raw) as self.file:
                 for list_item in self.list_of_body_records_chrom:
                     self.file.write(list_item.line.encode('utf-8'))
     else:
         for list_item in self.list_of_body_records_chrom:
             # Print the text itself: printing the encoded bytes would emit
             # their repr (b'...') on Python 3. The file branch writes each
             # line without adding a separator, so none is added here.
             print(list_item.line, end="")
Exemple #3
0
 def created_vcf_tabix_files(vcf_content):
     """Write ``vcf_content`` to a bgzipped temporary file and index it with tabix.

     The pair ``(vcf_path, vcf_path + ".tbi")`` is appended to the
     module-level ``created_files`` list and returned.

     Args:
         vcf_content: Bytes passed to ``BGZipWriter.write``.

     Returns:
         A 2-tuple ``(vcf_path, tbi_path)``.

     Raises:
         subprocess.CalledProcessError: If ``tabix`` exits with a non-zero
             status.
     """
     import os

     tmp_fd, tmp_input_path = tempfile.mkstemp(prefix='vw_test_file_', suffix='.vcf.gz')
     # mkstemp() returns an already-open OS-level descriptor. Wrap it so it
     # is closed with the file object instead of being leaked.
     with os.fdopen(tmp_fd, 'wb') as raw_fd:
         with bgzip.BGZipWriter(raw_fd) as fh:
             fh.write(vcf_content)
     # check=True raises CalledProcessError on a non-zero exit status, the
     # same exception check_returncode() would raise.
     subprocess.run(['tabix', '-p', 'vcf', tmp_input_path], check=True)
     files_path = (tmp_input_path, tmp_input_path + ".tbi")
     created_files.append(files_path)
     return files_path
 def write_header_in_gz_file(self):
     """ Writes header in the compressed file, or on the stdout, regarding input arguments.

     When ``self.path`` is truthy, the file is opened with mode ``"w+b"``
     (truncating it) and, through a ``bgzip.BGZipWriter`` bound to
     ``self.file``, the following are written UTF-8 encoded and in order:
     ``self.version``, the ``line`` of every item in
     ``self.list_of_header_objects``, and ``self.body_header_line.line``.
     Otherwise only the header-object lines are written to stdout as text.
     """
     if self.path:
         with open(self.path, "w+b") as raw:
             with bgzip.BGZipWriter(raw) as self.file:
                 self.file.write(self.version.encode('utf-8'))
                 for list_item in self.list_of_header_objects:
                     self.file.write(list_item.line.encode('utf-8'))
                 self.file.write(self.body_header_line.line.encode('utf-8'))
     else:
         # NOTE(review): unlike the file branch, the version line and the
         # body header line are not emitted here - confirm this is intended.
         for list_item in self.list_of_header_objects:
             # Print the text itself: printing the encoded bytes would emit
             # their repr (b'...') on Python 3. The file branch writes each
             # line without adding a separator, so none is added here.
             print(list_item.line, end="")
def partialize_vcf(uri: str, number_of_lines: int, zip_format: str="bgzip") -> bytes:
    """Return a compressed, in-memory copy of the lines yielded by ``_vcf_lines``.

    Args:
        uri: Forwarded to ``_vcf_lines``.
        number_of_lines: Forwarded to ``_vcf_lines``.
        zip_format: ``"bgzip"`` to compress with ``bgzip.BGZipWriter`` or
            ``"gzip"`` to compress with ``gzip.GzipFile``.

    Returns:
        The compressed bytes.

    Raises:
        ValueError: If ``zip_format`` is neither ``"bgzip"`` nor ``"gzip"``.
    """
    if zip_format not in ("bgzip", "gzip"):
        raise ValueError("Supported values for `zip_format` are 'bgzip' and 'gzip'")

    buffer = io.BytesIO()
    try:
        if zip_format == "bgzip":
            compressor = bgzip.BGZipWriter(buffer)
        else:
            compressor = gzip.GzipFile(fileobj=buffer, mode="w")
        # The compressor must be closed before the buffer is read so that
        # all pending output has been flushed into it.
        with compressor as sink:
            for chunk in _vcf_lines(uri, number_of_lines):
                sink.write(chunk)
        return buffer.getvalue()
    finally:
        buffer.close()
Exemple #6
0
    def test_profile_write(self):
        """Profile compressing the same payload with gzip and with bgzip.

        The payload is the inflated content of the
        ``tests/fixtures/partial.vcf.gz`` fixture. bgzip is profiled once
        per thread count from 1 up to ``bgzip.available_cores``, writing the
        payload in two pieces.
        """
        split_at = 987345

        print()
        with open("tests/fixtures/partial.vcf.gz", "rb") as fixture, \
                gzip.GzipFile(fileobj=fixture) as reader:
            payload = reader.read()

        with profile("gzip write"), \
                gzip.GzipFile(fileobj=io.BytesIO(), mode="w") as gz_out:
            gz_out.write(payload)

        for thread_count in range(1, bgzip.available_cores + 1):
            label = f"bgzip write (num_threads={thread_count})"
            with profile(label), \
                    bgzip.BGZipWriter(io.BytesIO(),
                                      num_threads=thread_count) as bgz_out:
                # Slicing stays inside the profiled region so the measured
                # work is unchanged.
                bgz_out.write(payload[:split_at])
                bgz_out.write(payload[split_at:])
        print()
Exemple #7
0
def install_cache_manual_fasta(vep_version, assembly, cache_path,
                               homo_sapiens_path, index_path):
    """Download the Ensembl reference FASTA for ``assembly`` and index it.

    Steps:
      1. Download the FASTA into
         ``<homo_sapiens_path>/<vep_version>_<assembly>/``.
      2. If ``magic`` does not describe the file as having an "extra field"
         (i.e. it is plain gzip rather than bgzip), recompress it with
         ``bgzip.BGZipWriter``.
      3. Run ``samtools faidx`` on the file.
      4. On success, create (or truncate) an empty file at ``index_path``.

    Args:
        vep_version: Used in the destination directory name and, for
            assemblies other than ``GRCh37``, in the Ensembl release URL
            (``GRCh37`` always uses release 75).
        assembly: ``"GRCh37"`` or ``"GRCh38"``.
        cache_path: Not used by this function.
        homo_sapiens_path: Parent directory of the destination directory.
        index_path: Path of the marker file created once indexing succeeds.

    Raises:
        KeyError: If ``assembly`` is not one of the supported keys.
        ReferenceDownloadError: If ``samtools faidx`` exits with a non-zero
            status.
    """
    import os

    dest_dir = "{}/{}_{}".format(homo_sapiens_path, vep_version, assembly)
    fasta_names = {
        "GRCh37": "Homo_sapiens.GRCh37.75.dna."
        "primary_assembly.fa.gz",
        "GRCh38": "Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
    }
    fasta_name = fasta_names[assembly]

    fasta_type = {"GRCh37": "dna", "GRCh38": "dna_index"}

    fasta_path = "{}/{}".format(dest_dir, fasta_name)

    # NOTE(review): the meaning of the third argument (1800) depends on
    # download_file, which is defined elsewhere - presumably a timeout.
    download_file(
        "ftp://ftp.ensembl.org/pub/release-{vep_ver}"
        "/fasta/homo_sapiens/{type}/"
        "{fasta_name}".format(
            vep_ver=75 if assembly == 'GRCh37' else vep_version,
            type=fasta_type[assembly],
            fasta_name=fasta_name),
        fasta_path, 1800)

    file_info = magic.from_file(fasta_path)

    # convert gzip to bgzip if necessary for indexing
    # bgzipped files are described as "gzip file with extra field"
    if "extra field" not in file_info:
        # Recompress into a sibling temporary file and swap it in at the
        # end. This streams the data instead of holding the whole
        # decompressed FASTA in memory, and a failure part-way through no
        # longer leaves the download truncated.
        tmp_path = fasta_path + ".bgz.tmp"
        try:
            with gzip.open(fasta_path, 'rb') as gzip_stream, \
                    open(tmp_path, 'wb') as bgzip_stream:
                with bgzip.BGZipWriter(bgzip_stream) as fh:
                    fh.writelines(gzip_stream)
            os.replace(tmp_path, fasta_path)
        finally:
            # After a successful replace the temporary file is gone; this
            # only removes the leftover from a failed conversion.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # Pass the arguments as a list so a path containing whitespace is not
    # split into several arguments.
    results = subprocess.run(["samtools", "faidx", fasta_path])
    if results.returncode == 0:
        open(index_path, 'w').close()
    else:
        raise ReferenceDownloadError("Error creating index. Please try again.")
Exemple #8
0
 def test_pathalogical_write(self):
     """Open a ``BGZipWriter`` over an in-memory buffer and write empty bytes.

     The test passes if entering and leaving the writer's context raises
     nothing.
     """
     buffer = io.BytesIO()
     with bgzip.BGZipWriter(buffer):
         # NOTE(review): the empty write goes to the underlying buffer, not
         # to the BGZipWriter itself - confirm this is intended.
         buffer.write(b"")