def split_vcf(in_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    Requires tabix to be installed.

    Args:
        in_file: path to the VCF file. Unless its extension is already
            ".gz" it is bgzip-compressed to ``in_file + ".gz"`` (an existing
            file of that name is reused) and that file is used instead.
        config: mapping read for ``config["ref"]["fasta"]`` and for the
            optional ``config["program"]["samtools"]`` and
            ``config["program"]["tabix"]`` executables, which default to
            "samtools" and "tabix".
        out_dir: directory for the per-chromosome files. Defaults to a
            "split" directory next to ``in_file``.

    Returns:
        A list with one output path per sequence name found in the first
        column of the FASTA index, in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")

    fasta_file = config["ref"]["fasta"]
    fasta_index = fasta_file + ".fai"
    samtools_path = config["program"].get("samtools", "samtools")
    tabix_path = config["program"].get("tabix", "tabix")

    # build the FASTA index if it is missing
    if not file_exists(fasta_index):
        samtools = sh.Command(samtools_path)
        samtools.faidx(fasta_file)

    # if in_file is not compressed, compress it
    # Compare by value: `is not` checks object identity, which is true for
    # any freshly built extension string, so ".gz" inputs were recompressed.
    (_, ext) = os.path.splitext(in_file)
    if ext != ".gz":
        gzip_file = in_file + ".gz"
        if not file_exists(gzip_file):
            sh.bgzip("-c", in_file, _out=gzip_file)
        in_file = gzip_file

    # create tabix index
    tabix_index(in_file)

    # find the chromosome names from the fasta index file
    chroms = str(sh.cut("-f1", fasta_index)).split()

    # make outfile from chromosome name
    def chr_out(chrom):
        out_file = replace_suffix(append_stem(in_file, chrom), ".vcf")
        return os.path.join(out_dir, os.path.basename(out_file))

    # run tabix to break up the vcf file
    def run_tabix(chrom):
        tabix = sh.Command(tabix_path)
        out_file = chr_out(chrom)
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            tabix("-h", in_file, chrom, _out=tmp_out_file)
        return out_file

    # Build the list eagerly: on Python 3 `map` is lazy, so the tabix calls
    # would not run until (and unless) the caller iterated the result.
    return [run_tabix(chrom) for chrom in chroms]
def index_gff(gff, compress):
    """Index and optionally compress a GFF file.

    \b
    Examples:
        bionorm index_gff Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.ann1.FAKE.gene_models_main.gff3

    """
    # NOTE(review): the "\b" line above looks like a help-formatter marker;
    # presumably the docstring doubles as CLI help text -- confirm before
    # reformatting it or making it a raw string.
    from sh import bgzip  # isort:skip
    from sh import tabix  # isort:skip

    target = Path(gff)

    # Validation: each failure is logged and ends the process with status 1.
    if not target.suffixes:
        logger.error(f"Target {target} does not have a file extension.")
        sys.exit(1)

    # Only the last extension is inspected, without its leading dot.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} befor indexing.")
        sys.exit(1)
    if extension not in GFF_TYPES:
        logger.error(
            f"File {target} does not have a recognized GFF extension.")
        sys.exit(1)

    if compress:
        # Compress with bgzip -f, then index the resulting "<name>.gz".
        bgzip(["-f", str(target)])
        target = target.parent / f"{target.name}.gz"

    # tabix runs twice: once with default options and once with --csi.
    for index_args in (["-p", "gff"], ["--csi", "-p", "gff"]):
        tabix(index_args + [str(target)])

    # The returned path is the ".gz" file when compression was requested.
    return target
def index_fasta(fasta, compress):
    """Index and optionally compress a fasta file.

    \b
    Examples:
        bionorm index_fasta Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.FAKE.genome_main.fna

    """
    # NOTE(review): the "\b" line above looks like a help-formatter marker;
    # presumably the docstring doubles as CLI help text -- confirm before
    # reformatting it or making it a raw string.
    from sh import bgzip  # isort:skip
    from sh import samtools  # isort:skip

    target = Path(fasta)

    # Validation: each failure is logged and ends the process with status 1.
    if not target.suffixes:
        logger.error(f"Target {target} does not have a file extension.")
        sys.exit(1)

    # Only the last extension is inspected, without its leading dot.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} befor indexing.")
        sys.exit(1)
    if extension not in FASTA_TYPES:
        logger.error(
            f"File {target} does not have a recognized FASTA extension.")
        sys.exit(1)

    if compress:
        # Compress with bgzip -f --index, then index the "<name>.gz" file.
        bgzip(["-f", "--index", str(target)])
        target = target.parent / f"{target.name}.gz"

    samtools(["faidx", str(target)])

    # The returned path is the ".gz" file when compression was requested.
    return target
def _break_vcf(self, in_file):
    """Split a VCF file into one file per chromosome in a "break" directory.

    Args:
        in_file: path to the VCF file. Unless its extension is already
            ".gz" it is bgzip-compressed to ``in_file + ".gz"`` and that
            file is used instead.

    Reads ``self.fasta_file`` and ``self.fasta_index``; the index is built
    with ``samtools faidx`` when it does not exist.

    Returns:
        A list with one output path per sequence name found in the first
        column of the FASTA index, in index order.
    """
    if not file_exists(self.fasta_index):
        sh.samtools.faidx(self.fasta_file)

    # if file is not compressed, compress it
    # Compare by value: `is not` checks object identity, which is true for
    # any freshly built extension string, so ".gz" inputs were recompressed.
    (_, ext) = os.path.splitext(in_file)
    if ext != ".gz":
        gzip_file = in_file + ".gz"
        sh.bgzip("-c", in_file, _out=gzip_file)
        in_file = gzip_file

    # create tabix index if it does not exist already
    if not file_exists(in_file + ".tbi"):
        sh.tabix("-p", "vcf", in_file)

    # find the chromosome names from the fasta index file
    chroms = str(sh.cut("-f1", self.fasta_index)).split()
    break_dir = os.path.join(os.path.dirname(in_file), "break")
    safe_makedir(break_dir)

    def chr_out(chrom):
        # append_stem is handed the full input path, so keep only the file
        # name before joining; joining a name that still carries its
        # directory would nest under (or, if absolute, bypass) break_dir.
        stemmed_name = os.path.basename(append_stem(in_file, chrom))
        out_file = os.path.join(break_dir, stemmed_name)
        # NOTE(review): split_vcf passes ".vcf" (with the dot) to
        # replace_suffix; confirm which form the helper expects.
        out_file = replace_suffix(out_file, "vcf")
        return out_file

    def tabix(chrom):
        out_file = chr_out(chrom)
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
        return out_file

    # use tabix to separate out the variants based on chromosome
    # Build the list eagerly: on Python 3 `map` is lazy, so the tabix calls
    # would not run until (and unless) the caller iterated the result.
    return [tabix(chrom) for chrom in chroms]