Exemple #1
0
def index_gff(gff, compress):
    """Index and optionally compress a GFF file.

        \b
    Examples:
        bionorm index_gff Medicago_truncatula/jemalong_A17.gnm5.ann1.FAKE/medtr.jemalong_A17.gnm5.ann1.FAKE.gene_models_main.gff3
    """
    # gff:      path to the GFF file to index.
    # compress: when truthy, bgzip the file first and index the ".gz" result.
    # Returns the Path that was indexed; exits the process with status 1 when
    # the file name fails any of the extension checks below.
    from sh import bgzip  # isort:skip
    from sh import tabix  # isort:skip

    target = Path(gff)
    if not target.suffixes:
        logger.error(f"Target {target} does not have a file extension.")
        sys.exit(1)

    # Only the last suffix decides whether the file is compressed / a GFF.
    extension = target.suffix.lstrip(".")
    if extension in COMPRESSED_TYPES:
        logger.error(f"Uncompress {target} before indexing.")
        sys.exit(1)
    if extension not in GFF_TYPES:
        logger.error(
            f"File {target} does not have a recognized GFF extension.")
        sys.exit(1)

    if compress:
        bgzip(["-f", str(target)])
        # From here on, work on the compressed file "<name>.gz" that sits
        # next to the original.
        target = target.with_name(f"{target.name}.gz")

    # Build two indexes for the same file: tabix's default one and a CSI one.
    tabix(["-p", "gff", str(target)])
    tabix(["--csi", "-p", "gff", str(target)])
    return target
Exemple #2
0
 def tabix(chrom):
     """Extract the records for ``chrom`` into its own file and return the path.

     The output path comes from ``chr_out``; if that file already exists the
     extraction is skipped. Otherwise ``tabix -h`` is run against ``in_file``
     with its output directed to the path yielded by ``file_transaction``.
     """
     destination = chr_out(chrom)
     if not file_exists(destination):
         with file_transaction(destination) as tmp_destination:
             sh.tabix("-h", in_file, chrom, _out=tmp_destination)
     return destination
Exemple #3
0
 def tabix(chrom):
     """Return the per-chromosome file for ``chrom``, creating it if missing.

     The target path is computed by ``chr_out``. When it is not already on
     disk, ``tabix -h`` is invoked on ``in_file`` for ``chrom`` and its output
     is sent to the path provided by ``file_transaction``.
     """
     chrom_file = chr_out(chrom)
     already_done = file_exists(chrom_file)
     if not already_done:
         with file_transaction(chrom_file) as tx_file:
             sh.tabix("-h", in_file, chrom, _out=tx_file)
     return chrom_file
Exemple #4
0
    def _break_vcf(self, in_file):
        """Split a VCF file into one VCF file per chromosome.

        The input is bgzip-compressed and tabix-indexed when needed, then the
        records of every sequence named in ``self.fasta_index`` are extracted
        into a ``break`` directory next to the (compressed) input.

        Args:
            in_file: path to the VCF file, either plain or ending in ``.gz``.

        Returns:
            A list with the per-chromosome output path for each name found in
            the first column of ``self.fasta_index``, in that order.
        """
        if not file_exists(self.fasta_index):
            sh.samtools.faidx(self.fasta_file)

        # if file is not compressed, compress it
        # Compare by value: an identity test (`is not`) against a string
        # literal is effectively always true, which re-compressed files that
        # already ended in ".gz".
        (_, ext) = os.path.splitext(in_file)
        if ext != ".gz":
            gzip_file = in_file + ".gz"
            sh.bgzip("-c", in_file, _out=gzip_file)
            in_file = gzip_file

        # create tabix index if it does not exist already
        if not file_exists(in_file + ".tbi"):
            sh.tabix("-p", "vcf", in_file)

        # find the chromosome names from the fasta index file
        chroms = str(sh.cut("-f1", self.fasta_index)).split()
        break_dir = os.path.join(os.path.dirname(in_file), "break")
        safe_makedir(break_dir)

        def chr_out(chrom):
            """Return the output path for ``chrom`` inside ``break_dir``."""
            out_file = os.path.join(break_dir, append_stem(in_file, chrom))
            out_file = replace_suffix(out_file, "vcf")
            return out_file

        def tabix(chrom):
            """Extract ``chrom`` from ``in_file`` unless the output exists."""
            out_file = chr_out(chrom)
            if file_exists(out_file):
                return out_file
            with file_transaction(out_file) as tmp_out_file:
                sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
            return out_file

        # use tabix to separate out the variants based on chromosome
        # Built eagerly: on Python 3 a bare map() is lazy, so the extraction
        # would not run until (and unless) the caller iterated the result.
        out_files = [tabix(chrom) for chrom in chroms]

        return out_files
Exemple #5
0
    def _break_vcf(self, in_file):
        """Split a VCF file into one VCF file per chromosome.

        The input is bgzip-compressed and tabix-indexed when needed, then the
        records of every sequence named in ``self.fasta_index`` are extracted
        into a ``break`` directory next to the (compressed) input.

        Args:
            in_file: path to the VCF file, either plain or ending in ``.gz``.

        Returns:
            A list with the per-chromosome output path for each name found in
            the first column of ``self.fasta_index``, in that order.
        """
        if not file_exists(self.fasta_index):
            sh.samtools.faidx(self.fasta_file)

        # if file is not compressed, compress it
        # Compare by value: an identity test (`is not`) against a string
        # literal is effectively always true, which re-compressed files that
        # already ended in ".gz".
        (_, ext) = os.path.splitext(in_file)
        if ext != ".gz":
            gzip_file = in_file + ".gz"
            sh.bgzip("-c", in_file, _out=gzip_file)
            in_file = gzip_file

        # create tabix index if it does not exist already
        if not file_exists(in_file + ".tbi"):
            sh.tabix("-p", "vcf", in_file)

        # find the chromosome names from the fasta index file
        chroms = str(sh.cut("-f1", self.fasta_index)).split()
        break_dir = os.path.join(os.path.dirname(in_file), "break")
        safe_makedir(break_dir)

        def chr_out(chrom):
            """Return the output path for ``chrom`` inside ``break_dir``."""
            out_file = os.path.join(break_dir, append_stem(in_file, chrom))
            out_file = replace_suffix(out_file, "vcf")
            return out_file

        def tabix(chrom):
            """Extract ``chrom`` from ``in_file`` unless the output exists."""
            out_file = chr_out(chrom)
            if file_exists(out_file):
                return out_file
            with file_transaction(out_file) as tmp_out_file:
                sh.tabix("-h", in_file, chrom, _out=tmp_out_file)
            return out_file

        # use tabix to separate out the variants based on chromosome
        # Built eagerly: on Python 3 a bare map() is lazy, so the extraction
        # would not run until (and unless) the caller iterated the result.
        out_files = [tabix(chrom) for chrom in chroms]

        return out_files