Esempio n. 1
0
    def after_genome_download(self, genome):
        """
        Build a GMAP index for a downloaded genome.

        Nothing is done when ``cmd_ok("gmap_build")`` is falsy. A genome whose
        filename ends in ".gz" is gunzipped for the build and bgzipped again
        afterwards, also when the build itself fails.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props``, ``name`` and ``filename`` are read.

        Raises
        ------
        subprocess.CalledProcessError
            If ``gunzip`` or ``bgzip`` exits with a non-zero status.
        """
        if not cmd_ok("gmap_build"):
            return

        # Create index dir
        index_dir = genome.props["gmap"]["index_dir"]
        mkdir_p(index_dir)

        # If the genome is bgzipped it needs to be unzipped first
        fname = genome.filename
        bgzip = fname.endswith(".gz")
        if bgzip:
            # check_call raises by itself on a non-zero exit status,
            # so no return-code check is needed
            sp.check_call(["gunzip", fname])
            fname = re.sub(r"\.gz$", "", fname)

        try:
            # Index the file that is on disk right now: after gunzip this is
            # the decompressed path, not genome.filename
            cmd = "gmap_build -D {} -d {} {}".format(
                index_dir, genome.name, fname)
            run_index_cmd("gmap", cmd)
        finally:
            # Restore the compressed genome, even if indexing failed.
            # NOTE: needs the bgzip executable (original hint: is tabix installed?)
            if bgzip:
                sp.check_call(["bgzip", fname])
Esempio n. 2
0
    def after_genome_download(self, genome, force=False):
        """
        Build a HISAT2 index for a downloaded genome.

        Nothing is done when ``cmd_ok("hisat2-build")`` is falsy, or when the
        index directory already contains a ``*.ht2`` file. A genome whose
        filename ends in ".gz" is gunzipped for the build and bgzipped again
        afterwards, also when the build itself fails.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props`` and ``filename`` are read.
        force : bool, optional
            Remove the index directory before checking for an existing index.

        Raises
        ------
        subprocess.CalledProcessError
            If ``gunzip`` or ``bgzip`` exits with a non-zero status.
        """
        if not cmd_ok("hisat2-build"):
            return

        # Create index dir
        index_dir = genome.props["hisat2"]["index_dir"]
        index_name = genome.props["hisat2"]["index_name"]
        if force:
            # Start from scratch
            rmtree(index_dir, ignore_errors=True)
        mkdir_p(index_dir)

        # Any *.ht2 file means an index is already present
        if any(entry.endswith(".ht2") for entry in os.listdir(index_dir)):
            return

        # If the genome is bgzipped it needs to be unzipped first
        fname = genome.filename
        bgzip = fname.endswith(".gz")
        if bgzip:
            # check_call raises by itself on a non-zero exit status
            sp.check_call(["gunzip", fname])
            fname = re.sub(r"\.gz$", "", fname)

        try:
            # Create index
            cmd = "hisat2-build {} {}".format(fname, index_name)
            run_index_cmd("hisat2", cmd)
        finally:
            # Restore the compressed genome, even if indexing failed.
            # NOTE: needs the bgzip executable (original hint: is tabix installed?)
            if bgzip:
                sp.check_call(["bgzip", fname])
Esempio n. 3
0
def genome(request, tempdir):
    """
    Yield a small test genome, bgzipped or plain depending on ``request.param``.

    The source FASTA under tests/data is (de)compressed in place so that it
    matches the requested variant, copied into a per-variant directory below
    ``tempdir``, and wrapped in a ``Genome``. After the consumer is done, a
    plain FASTA that is still present is bgzipped again.
    """
    name = "dm3"  # Use fake name for blacklist test
    fafile = "tests/data/small_genome.fa"
    gz_file = fafile + ".gz"
    bgzipped = request.param == "bgzipped"

    # Bring the file on disk in line with the requested variant
    gz_present = os.path.exists(gz_file)
    if gz_present and not bgzipped:
        check_call(["gunzip", gz_file])
    elif bgzipped and not gz_present:
        check_call(["bgzip", fafile])

    # One genomes directory per variant, one sub directory per genome
    genome_dir = os.path.join(tempdir, request.param)
    tmpdir = os.path.join(genome_dir, name)
    mkdir_p(tmpdir)

    source = gz_file if bgzipped else fafile
    copyfile(source, os.path.join(tmpdir, os.path.basename(source)))

    for p in init_plugins():
        activate(p)

    # provide the fixture value
    yield Genome(name, genome_dir=genome_dir)

    # Teardown: leave the test data compressed
    if not bgzipped and os.path.exists(fafile):
        check_call(["bgzip", fafile])
Esempio n. 4
0
    def after_genome_download(self, genome, force=False):
        """
        Build a STAR index for a downloaded genome.

        Nothing is done when ``cmd_ok("STAR")`` is falsy, or when the path in
        ``genome.props["star"]["index_name"]`` already exists. A genome whose
        filename ends in ".gz" is gunzipped for the build and bgzipped again
        afterwards, also when the build itself fails.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props`` and ``filename`` are read.
        force : bool, optional
            Remove the index directory before checking for an existing index.

        Raises
        ------
        subprocess.CalledProcessError
            If ``gunzip`` or ``bgzip`` exits with a non-zero status.
        """
        if not cmd_ok("STAR"):
            return

        # Create index dir
        index_dir = genome.props["star"]["index_dir"]
        index_name = genome.props["star"]["index_name"]
        if force:
            # Start from scratch
            rmtree(index_dir, ignore_errors=True)
        mkdir_p(index_dir)

        if os.path.exists(index_name):
            return

        # If the genome is bgzipped it needs to be unzipped first
        fname = genome.filename
        bgzip = fname.endswith(".gz")
        if bgzip:
            # check_call raises by itself on a non-zero exit status
            sp.check_call(["gunzip", fname])
            fname = re.sub(r"\.gz$", "", fname)

        try:
            # Create index; the index dir doubles as the output file prefix
            cmd = "STAR --runMode genomeGenerate --genomeFastaFiles {} --genomeDir {} --outFileNamePrefix {}".format(
                fname, index_dir, index_dir)
            run_index_cmd("star", cmd)
        finally:
            # Rezip genome if it was bgzipped, even if indexing failed.
            # NOTE: needs the bgzip executable (original hint: is tabix installed?)
            if bgzip:
                sp.check_call(["bgzip", fname])
Esempio n. 5
0
    def after_genome_download(self, genome, threads=1, force=False):
        """
        Create a STAR index for ``genome``, using its GTF annotation if present.

        Returns early when ``cmd_ok("STAR")`` is falsy, or when the index path
        already exists and ``force`` is not set. Otherwise the index directory
        is removed and recreated before STAR is run.

        Parameters
        ----------
        genome : object
            Genome to index; ``plugin``, ``filename`` and
            ``annotation_gtf_file`` are read.
        threads : int, optional
            Value passed to STAR as ``--runThreadN``.
        force : bool, optional
            Rebuild even if the index path already exists.
        """
        index_name = genome.plugin["star"]["index_name"]
        if not cmd_ok("STAR"):
            return
        if os.path.exists(index_name) and not force:
            return

        # Always build into a fresh directory
        index_dir = genome.plugin["star"]["index_dir"]
        rm_rf(index_dir)
        mkdir_p(index_dir)

        # Work on the path provided by extracted_file (presumably the
        # decompressed genome; see that helper)
        with extracted_file(genome.filename) as fname:
            cmd = (
                f"STAR --runMode genomeGenerate --runThreadN {threads} "
                f"--genomeFastaFiles {fname} --genomeDir {index_dir} "
                f"--outFileNamePrefix {index_dir}"
            )

            gtf_file = genome.annotation_gtf_file
            if not gtf_file:
                logger.info("Creating STAR index without annotation file.")
                run_index_cmd("star", cmd)
                return

            # An annotation is available: hand it to STAR as well
            with extracted_file(gtf_file) as _gtf_file:
                run_index_cmd("star", cmd + f" --sjdbGTFfile {_gtf_file}")
Esempio n. 6
0
def download_annotation(name, annot, genomes_dir, localname, n=None):
    """
    Download the extended genePred file from the UCSC MySQL database.
    Next convert this to a BED and GTF file.

    The genePred file is written to a temporary directory inside the genome
    directory; that directory is removed afterwards, also when a query or a
    conversion fails.

    Parameters
    ----------
    name : str
        UCSC database to query.
    annot : str
        Table to download.
    genomes_dir : str
        Directory containing the genome directories.
    localname : str
        Name of the genome directory and prefix of the output files.
    n : int, optional
        When truthy, limit the download to this many rows.

    Raises
    ------
    subprocess.CalledProcessError
        If genePredToGtf or genePredToBed exits with a non-zero status.
    """
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        pred_file = f"{os.path.join(tmp_dir, localname)}.annotation.extended.gp"
        gtf_file = f"{os.path.join(out_dir, localname)}.annotation.gtf"
        bed_file = f"{os.path.join(out_dir, localname)}.annotation.bed"

        # MySQL query 1: get column names for this genePred
        column_rows = list(query_ucsc(f"SHOW COLUMNS FROM {annot};", database=name))

        # drop columns the UCSC tools cannot handle
        # see https://genome.ucsc.edu/FAQ/FAQformat.html#format9
        accepted_cols = [
            "geneName",
            "name",
            "chrom",
            "strand",
            "txStart",
            "txEnd",
            "cdsStart",
            "cdsEnd",
            "exonCount",
            "exonStarts",
            "exonEnds",
            "score",
            "name2",
            "cdsStartStat",
            "cdsEndStat",
            "exonFrames",
        ]
        # the first field of each row is the column name; table order is kept
        cols = ",".join(row[0] for row in column_rows if row[0] in accepted_cols)

        # MySQL query 2: download genePred
        command = f"SELECT {cols} FROM {annot};"
        if n:
            command = f"SELECT {cols} FROM {annot} LIMIT {n};"
        ret = query_ucsc(command, database=name)

        # clean up genePred
        df = pd.DataFrame.from_records(ret)
        # these positions arrive as bytes and are decoded to text
        # (presumably the exon start/end/frame lists - TODO confirm)
        for c in [8, 9, 14]:
            if c in df:
                df[c] = df[c].str.decode("utf-8")
        df.to_csv(pred_file, index=False, header=False, sep="\t")

        # convert genePred to GTF and BED; argument lists avoid shell
        # interpretation of the file paths
        sp.check_call(
            ["genePredToGtf", "-source=genomepy", "file", pred_file, gtf_file]
        )
        sp.check_call(["genePredToBed", pred_file, bed_file])
    finally:
        rm_rf(tmp_dir)
Esempio n. 7
0
    def after_genome_download(self, genome):
        """
        Build a minimap2 index for ``genome``.

        Nothing is done when ``cmd_ok("minimap2")`` is falsy. The index
        directory and index name are taken from ``genome.props["minimap2"]``.
        """
        if cmd_ok("minimap2"):
            # Make sure the directory that will hold the index exists
            target_dir = genome.props["minimap2"]["index_dir"]
            index_path = genome.props["minimap2"]["index_name"]
            mkdir_p(target_dir)

            # Assemble and run the indexing command
            template = "minimap2 -d {} {}"
            run_index_cmd(
                "minimap2", template.format(index_path, genome.filename))
Esempio n. 8
0
    def after_genome_download(self, genome, threads=1, force=False):
        """
        Build a HISAT2 index for ``genome``, splice-aware when a GTF is present.

        Returns early when ``cmd_ok("hisat2-build")`` is falsy, or when
        ``<index_name>.1.ht2`` exists and ``force`` is not set. Otherwise the
        index directory is removed and recreated. With an annotation, splice
        site and exon files are generated in ``genome.genome_dir`` and passed
        to hisat2-build. Files decompressed for the build are compressed again
        afterwards, also when a step fails.

        Parameters
        ----------
        genome : object
            Genome to index; ``plugin``, ``filename``, ``genome_dir`` and
            ``annotation_gtf_file`` are read.
        threads : int, optional
            Value passed to hisat2-build as ``-p``.
        force : bool, optional
            Rebuild even if the index already exists.

        Raises
        ------
        subprocess.CalledProcessError
            If one of the hisat2 helper scripts exits with a non-zero status.
        """
        import shutil

        index_name = genome.plugin["hisat2"]["index_name"]
        if not cmd_ok("hisat2-build") or (
            os.path.exists(f"{index_name}.1.ht2") and not force
        ):
            return

        index_dir = genome.plugin["hisat2"]["index_dir"]
        rm_rf(index_dir)
        mkdir_p(index_dir)

        # gunzip genome if bgzipped and return up-to-date genome name
        fname, bgzip = gunzip_and_name(genome.filename)
        try:
            # index command
            cmd = f"hisat2-build -p {threads} {fname} {index_name}"

            # if an annotation is present, generate a splice-aware index
            gtf_file = genome.annotation_gtf_file
            if gtf_file:
                # gunzip if gzipped
                gtf_file, gzip_file = gunzip_and_name(gtf_file)
                try:
                    # the helper scripts are looked up next to the hisat2
                    # executable, as <path-to-hisat2>_extract_*.py; an empty
                    # prefix (hisat2 not found) keeps the old failure mode
                    hisat_path = shutil.which("hisat2") or ""

                    # generate splice and exon site files to enhance indexing
                    splice_script = hisat_path + "_extract_splice_sites.py"
                    splice_file = os.path.join(genome.genome_dir, "splice_sites.txt")
                    with open(splice_file, "w") as out:
                        sp.check_call(["python3", splice_script, gtf_file], stdout=out)

                    exon_script = hisat_path + "_extract_exons.py"
                    exon_file = os.path.join(genome.genome_dir, "exon_sites.txt")
                    with open(exon_file, "w") as out:
                        sp.check_call(["python3", exon_script, gtf_file], stdout=out)
                finally:
                    # re-gzip annotation if gunzipped
                    gzip_and_name(gtf_file, gzip_file)

                # update index command with annotation
                cmd += f" --ss {splice_file} --exon {exon_file}"
            else:
                print("\nCreating Hisat2 index without annotation file.")

            # Create index
            run_index_cmd("hisat2", cmd)
        finally:
            # re-bgzip genome if gunzipped
            bgzip_and_name(fname, bgzip)
Esempio n. 9
0
    def after_genome_download(self, genome):
        """
        Build a Bowtie2 index for ``genome``.

        Nothing is done when ``cmd_ok("bowtie2-build")`` is falsy. The index
        directory and index name are taken from ``genome.props["bowtie2"]``.
        """
        if cmd_ok("bowtie2-build"):
            # The index directory has to exist before bowtie2-build runs
            bt2_dir = genome.props["bowtie2"]["index_dir"]
            bt2_prefix = genome.props["bowtie2"]["index_name"]
            mkdir_p(bt2_dir)

            # bowtie2-build takes the FASTA first, then the index name
            template = "bowtie2-build {} {}"
            run_index_cmd(
                "bowtie2", template.format(genome.filename, bt2_prefix))
Esempio n. 10
0
    def after_genome_download(self, genome):
        """
        Create the minimap2 index belonging to ``genome``.

        Skipped when ``cmd_ok("minimap2")`` is falsy.
        """
        tool = "minimap2"
        if not cmd_ok(tool):
            return

        # Where the index lives, and what it is called
        out_dir = genome.props["minimap2"]["index_dir"]
        out_name = genome.props["minimap2"]["index_name"]
        mkdir_p(out_dir)

        # Run the indexer: index name first, FASTA second
        build_cmd = "minimap2 -d {} {}".format(out_name, genome.filename)
        run_index_cmd(tool, build_cmd)
Esempio n. 11
0
    def after_genome_download(self, genome):
        """
        Create the Bowtie2 index belonging to ``genome``.

        Skipped when ``cmd_ok("bowtie2-build")`` is falsy.
        """
        if not cmd_ok("bowtie2-build"):
            return

        # Where the index lives, and what it is called
        out_dir = genome.props["bowtie2"]["index_dir"]
        out_name = genome.props["bowtie2"]["index_name"]
        mkdir_p(out_dir)

        # Run the indexer: FASTA first, index name second
        build_args = (genome.filename, out_name)
        run_index_cmd("bowtie2", "bowtie2-build {} {}".format(*build_args))
Esempio n. 12
0
    def after_genome_download(self, genome):
        """
        Build a GMAP index for ``genome``.

        Nothing is done when ``cmd_ok("gmap_build")`` is falsy.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props``, ``name`` and ``filename`` are read.
        """
        if not cmd_ok("gmap_build"):
            return

        # Create index dir
        index_dir = genome.props["gmap"]["index_dir"]
        mkdir_p(index_dir)

        # Create index. The command only needs the index directory (-D) and
        # the genome name (-d); the "index_name" property is not used here.
        cmd = "gmap_build -D {} -d {} {}".format(index_dir, genome.name,
                                                 genome.filename)
        run_index_cmd("gmap", cmd)
Esempio n. 13
0
    def after_genome_download(self, genome):
        """
        Create the GMAP index belonging to ``genome``.

        Skipped when ``cmd_ok("gmap_build")`` is falsy.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props``, ``name`` and ``filename`` are read.
        """
        if not cmd_ok("gmap_build"):
            return

        # Create index dir
        index_dir = genome.props["gmap"]["index_dir"]
        mkdir_p(index_dir)

        # Create index. gmap_build is addressed by directory (-D) and genome
        # name (-d), so the "index_name" property is not needed in this method.
        cmd = "gmap_build -D {} -d {} {}".format(
                index_dir, genome.name, genome.filename)
        run_index_cmd("gmap", cmd)
Esempio n. 14
0
    def after_genome_download(self, genome):
        """
        Build a BWA index for ``genome``.

        Nothing is done when ``cmd_ok("bwa")`` is falsy. The genome FASTA is
        symlinked to the index path (unless something already exists there)
        and ``bwa index`` is run on that path.
        """
        if cmd_ok("bwa"):
            # Create index dir
            bwa_dir = genome.props["bwa"]["index_dir"]
            linked_fa = genome.props["bwa"]["index_name"]
            mkdir_p(bwa_dir)

            # bwa is pointed at a link inside the index dir
            link_missing = not os.path.exists(linked_fa)
            if link_missing:
                os.symlink(genome.filename, linked_fa)

            run_index_cmd("bwa", "bwa index {}".format(linked_fa))
Esempio n. 15
0
    def after_genome_download(self, genome):
        """
        Create the BWA index belonging to ``genome``.

        Skipped when ``cmd_ok("bwa")`` is falsy. Indexing is done on a symlink
        to the genome FASTA, created at the index path when nothing exists
        there yet.
        """
        if not cmd_ok("bwa"):
            return

        # Where the index lives, and the FASTA path bwa will be run on
        out_dir = genome.props["bwa"]["index_dir"]
        fa_link = genome.props["bwa"]["index_name"]
        mkdir_p(out_dir)

        if os.path.exists(fa_link):
            pass  # reuse whatever is already at the index path
        else:
            os.symlink(genome.filename, fa_link)

        template = "bwa index {}"
        run_index_cmd("bwa", template.format(fa_link))
Esempio n. 16
0
    def after_genome_download(self, genome, force=False):
        """
        Build a Bowtie2 index for ``genome`` unless one is already present.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props`` and ``filename`` are read.
        force : bool, optional
            Remove the index directory before checking for an existing index.
        """
        if not cmd_ok("bowtie2-build"):
            return

        bt2_dir = genome.props["bowtie2"]["index_dir"]
        bt2_prefix = genome.props["bowtie2"]["index_name"]

        # A forced run wipes whatever was built before
        if force:
            rmtree(bt2_dir, ignore_errors=True)
        mkdir_p(bt2_dir)

        # Any *.bt2 file counts as an existing index
        for entry in os.listdir(bt2_dir):
            if entry.endswith(".bt2"):
                return

        template = "bowtie2-build {} {}"
        run_index_cmd("bowtie2", template.format(genome.filename, bt2_prefix))
Esempio n. 17
0
    def after_genome_download(self, genome, threads=1, force=False):
        """
        Build a minimap2 index for ``genome`` unless one is already present.

        Parameters
        ----------
        genome : object
            Genome to index; its ``plugin`` and ``filename`` are read.
        threads : int, optional
            Value passed to minimap2 as ``-t``.
        force : bool, optional
            Remove the index directory before checking for an existing index.
        """
        if not cmd_ok("minimap2"):
            return

        index_dir = genome.plugin["minimap2"]["index_dir"]
        index_name = genome.plugin["minimap2"]["index_name"]

        # A forced run wipes whatever was built before
        if force:
            rm_rf(index_dir)
        mkdir_p(index_dir)

        # Any *.mmi file counts as an existing index
        has_index = any(
            entry.endswith(".mmi") for entry in os.listdir(index_dir)
        )
        if has_index:
            return

        run_index_cmd(
            "minimap2",
            f"minimap2 -t {threads} -d {index_name} {genome.filename}",
        )
Esempio n. 18
0
    def after_genome_download(self, genome, threads=1, force=False):
        """
        Build a BWA index for ``genome`` unless one is already present.

        Parameters
        ----------
        genome : object
            Genome to index; its ``plugin`` and ``filename`` are read.
        threads : int, optional
            Accepted for a uniform plugin signature; not used by this method.
        force : bool, optional
            Remove the index directory before checking for an existing index.
        """
        if not cmd_ok("bwa"):
            return

        index_dir = genome.plugin["bwa"]["index_dir"]
        index_name = genome.plugin["bwa"]["index_name"]

        # A forced run wipes whatever was built before
        if force:
            rm_rf(index_dir)
        mkdir_p(index_dir)

        # Any *.bwt file counts as an existing index
        for entry in os.listdir(index_dir):
            if entry.endswith(".bwt"):
                return

        # bwa is run on a symlink to the genome inside the index dir
        link_exists = os.path.exists(index_name)
        if not link_exists:
            os.symlink(genome.filename, index_name)
        run_index_cmd("bwa", f"bwa index {index_name}")
Esempio n. 19
0
    def after_genome_download(self, genome, force=False):
        """
        Create the BWA index belonging to ``genome`` if it is not there yet.

        Parameters
        ----------
        genome : object
            Genome to index; its ``props`` and ``filename`` are read.
        force : bool, optional
            Remove the index directory before checking for an existing index.
        """
        if not cmd_ok("bwa"):
            return

        bwa_dir = genome.props["bwa"]["index_dir"]
        fa_link = genome.props["bwa"]["index_name"]

        # Start from scratch on request
        if force:
            rmtree(bwa_dir, ignore_errors=True)
        mkdir_p(bwa_dir)

        # A *.bwt file in the index dir means the work was done before
        already_indexed = any(
            entry.endswith(".bwt") for entry in os.listdir(bwa_dir)
        )
        if already_indexed:
            return

        # bwa is run on a symlink to the genome inside the index dir
        if not os.path.exists(fa_link):
            os.symlink(genome.filename, fa_link)

        template = "bwa index {}"
        run_index_cmd("bwa", template.format(fa_link))
Esempio n. 20
0
def download_assembly_report(acc: str, fname: str = None, quiet=False):
    """
    Retrieve the NCBI assembly report.

    Returns the assembly_report as a pandas DataFrame if fname is not specified.
    If fname is specified, the report is written to that file instead.

    Parameters
    ----------
    acc : str
        Assembly accession (GCA or GCF)
    fname : str, optional
        Save assembly_report to this filename.
    quiet : bool, optional
        Silence warnings.

    Returns
    -------
    pandas.DataFrame or None
        NCBI assembly report. None if the report was saved to fname, if acc is
        not a string starting with "GCA" or "GCF", or if no report URL was
        found for acc.
    """
    msg = "Could not download the assembly report from NCBI. "
    if not isinstance(acc, str) or not acc.startswith(("GCA", "GCF")):
        if not quiet:
            logger.warning(msg)
        return None

    assembly_report = _assembly_report_url(acc)
    if assembly_report is None:
        if not quiet:
            logger.warning(msg + f"Assembly accession '{acc}' not found.")
        return None

    asm_report = pd.read_csv(assembly_report,
                             sep="\t",
                             comment="#",
                             names=ASM_FORMAT,
                             dtype=str)

    if not fname:
        return asm_report

    # A bare file name has no directory part; only create a directory when
    # there is one (an empty path cannot be created)
    out_dir = os.path.dirname(fname)
    if out_dir:
        mkdir_p(out_dir)
    asm_report.to_csv(fname, sep="\t", index=False)
    return None
Esempio n. 21
0
    def after_genome_download(self, genome, threads=1, force=False):
        """
        Build a STAR index for ``genome``, using its GTF annotation if present.

        Returns early when ``cmd_ok("STAR")`` is falsy, or when the index path
        already exists and ``force`` is not set. Otherwise the index directory
        is removed and recreated before STAR is run. Files decompressed for the
        build are compressed again afterwards, also when indexing fails.

        Parameters
        ----------
        genome : object
            Genome to index; ``plugin``, ``filename`` and
            ``annotation_gtf_file`` are read.
        threads : int, optional
            Value passed to STAR as ``--runThreadN``.
        force : bool, optional
            Rebuild even if the index path already exists.
        """
        index_name = genome.plugin["star"]["index_name"]
        if not cmd_ok("STAR") or (os.path.exists(index_name) and not force):
            return

        index_dir = genome.plugin["star"]["index_dir"]
        rmtree(index_dir, ignore_errors=True)
        mkdir_p(index_dir)

        # gunzip genome if bgzipped and return up-to-date genome name
        fname, bgzip = gunzip_and_name(genome.filename)
        try:
            # index command
            cmd = (
                f"STAR --runMode genomeGenerate --runThreadN {threads} "
                + f"--genomeFastaFiles {fname} --genomeDir {index_dir} "
                + f"--outFileNamePrefix {index_dir}"
            )

            # if an annotation is present, generate a splice-aware index
            gtf_file = genome.annotation_gtf_file
            if gtf_file:
                # gunzip if gzipped
                gtf_file, gzip_file = gunzip_and_name(gtf_file)
                try:
                    # update index command with annotation and create index
                    run_index_cmd("star", cmd + f" --sjdbGTFfile {gtf_file}")
                finally:
                    # re-gzip annotation if gunzipped
                    gzip_and_name(gtf_file, gzip_file)
            else:
                print("\nCreating STAR index without annotation file.")
                # Create index
                run_index_cmd("star", cmd)
        finally:
            # re-bgzip genome if gunzipped
            bgzip_and_name(fname, bgzip)
Esempio n. 22
0
def generate_env(fname: str = "exports.txt", genomes_dir: str = None):
    """
    Write the export statements for the installed genomes to a file.

    An absolute path (after expanding environment variables and ``~``) is used
    as given. Anything else is placed in the genomepy user config directory,
    so the default output is ``exports.txt`` in that directory.

    Parameters
    ----------
    fname: str, optional
        Absolute path or name of the output file.

    genomes_dir: str, optional
        Directory with installed genomes to export.
    """
    expanded = os.path.expanduser(os.path.expandvars(fname))
    in_config_dir = os.path.join(user_config_dir("genomepy"), fname)
    if os.path.isabs(expanded):
        target = expanded
    else:
        target = in_config_dir

    # the parent directory may not exist yet
    mkdir_p(os.path.dirname(target))
    with open(target, "w") as fout:
        # one export per line
        fout.writelines(f"{env}\n" for env in _generate_exports(genomes_dir))
Esempio n. 23
0
def manage_config(cmd):
    """
    Manage the genomepy config file.

    Parameters
    ----------
    cmd : str
        "file" prints the path of the config file, "show" prints its
        contents, "generate" writes a fresh copy of the default config to the
        user config directory and switches to it.

    Raises
    ------
    ValueError
        If cmd is not one of the commands above.
    """
    if cmd == "file":
        print(config.config_file)
        return

    if cmd == "show":
        with open(config.config_file) as handle:
            print(handle.read())
        return

    if cmd != "generate":
        raise ValueError(f"Invalid config command: {cmd}")

    # cmd == "generate" from here on
    config_dir = user_config_dir("genomepy")
    if not os.path.exists(config_dir):
        mkdir_p(config_dir)
    new_config = os.path.join(config_dir, "genomepy.yaml")

    # existing config must be removed before norns picks up the default again
    if os.path.exists(new_config):
        os.unlink(new_config)
    defaults = norns.config("genomepy", default="cfg/default.yaml")
    default_config = defaults.config_file

    # copy the default config verbatim
    with open(new_config, "w") as fout, open(default_config) as fin:
        fout.write(fin.read())

    config.config_file = new_config
    print(f"Created config file {new_config}")
Esempio n. 24
0
def download_annotation(genomes_dir, annot_url, localname, n=None):
    """
    Download an annotation file and derive GTF and BED files from it.

    The annotation is downloaded (or copied, when ``annot_url`` is an existing
    local path) into a temporary directory inside the genome directory and
    converted to an intermediate genePred file. GTF and BED files are generated
    from that where the input is not already in that format, and both are
    moved into the genome directory. The temporary directory is removed
    afterwards, also when a step fails.

    Parameters
    ----------
    genomes_dir : str
        Directory containing the genome directories.
    annot_url : str
        URL or local path of the annotation file.
    localname : str
        Name of the genome directory and prefix of the output files.
    n : int, optional
        When given, the file is fetched with ``download_head(..., n)`` and
        treated as uncompressed.

    Raises
    ------
    TypeError
        If the file extension does not contain bed, gff, gtf or txt.
    subprocess.CalledProcessError
        If a conversion command exits with a non-zero status.
    """

    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        ext, is_compressed = get_file_info(annot_url)

        annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
        tmp_annot_file = os.path.join(tmp_dir, annot_url.split("/")[-1])
        get_file = shutil.copyfile if os.path.exists(annot_url) else download_file
        if n is None:
            get_file(annot_url, tmp_annot_file)
        else:
            download_head(annot_url, tmp_annot_file, n)
            is_compressed = False

        # unzip input file (if needed)
        if is_compressed:
            annot_file = extract_archive(tmp_annot_file, outfile=annot_file)
        else:
            shutil.move(tmp_annot_file, annot_file)

        # generate intermediate file (GenePred)
        pred_file = annot_file.replace(ext, ".gp")
        if "bed" in ext:
            cmd = "bedToGenePred {0} {1}"
        elif "gff" in ext:
            # example annotation: GRCh38.p12 from NCBI
            cmd = "gff3ToGenePred -useName -warnAndContinue {0} {1}"
        elif "gtf" in ext:
            cmd = "gtfToGenePred -genePredExt -allErrors -ignoreGroupsWithoutExons {0} {1}"
        elif "txt" in ext:
            # UCSC annotations only
            with open(annot_file) as f:
                cols = f.readline().split("\t")

            # extract the genePred format columns:
            # locate the strand field in the first line and start one
            # position before it (enumerate is 0-based, cut is 1-based)
            start_col = 1
            for i, col in enumerate(cols):
                if col in ["+", "-"]:
                    start_col = i - 1
                    break
            end_col = start_col + 10
            cmd = (
                f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                +
                """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
            )
        else:
            raise TypeError(f"file type extension {ext} not recognized!")

        if n is None and "gencode" in annot_url:
            rename_contigs(annot_file)

        sp.check_call(cmd.format(annot_file, pred_file), shell=True)

        # generate gzipped gtf file (if required)
        gtf_file = annot_file.replace(ext, ".gtf")
        if "gtf" not in ext:
            cmd = "genePredToGtf -source=genomepy file {0} {1}"
            sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

        # generate gzipped bed file (if required)
        bed_file = annot_file.replace(ext, ".bed")
        if "bed" not in ext:
            cmd = "genePredToBed {0} {1}"
            sp.check_call(cmd.format(pred_file, bed_file), shell=True)

        # transfer the files from the tmpdir to the genome_dir
        for src in [gtf_file, bed_file]:
            dst = os.path.join(out_dir, os.path.basename(src))
            shutil.move(src, dst)
    finally:
        # never leave the temporary directory behind, not even on failure
        rm_rf(tmp_dir)
Esempio n. 25
0
    def download_genome(
        self,
        name: str,
        genomes_dir: str = None,
        localname: str = None,
        mask: str = "soft",
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is fetched into a temporary directory inside the genome
        directory, decompressed if needed, post-processed when the provider
        defines ``_post_process_download``, and moved into place as
        ``<localname>.fa``. A README with the download metadata is written
        or updated next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)
        """
        name = self._check_name(name)
        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        mkdir_p(out_dir)

        logger.info(
            f"Downloading genome from {self.name}. Target URL: {link}...")

        # a link that exists on disk is copied, anything else is downloaded
        if os.path.exists(link):
            get_file = shutil.copyfile
        else:
            get_file = download_file

        fasta_name = f"{localname}.fa"

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            downloaded = os.path.join(tmp_dir, link.split("/")[-1])
            fname = os.path.join(tmp_dir, fasta_name)

            get_file(link, downloaded)
            logger.info(
                "Genome download successful, starting post processing...")

            # unzip genome
            _ext, compressed = get_file_info(link)
            if compressed:
                extract_archive(downloaded, outfile=fname, concat=True)
            else:
                shutil.move(downloaded, fname)

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(
                    name=name, fname=fname, out_dir=out_dir, mask=mask
                )

            # transfer the genome from the tmpdir to the genome_dir
            dst = os.path.join(out_dir, fasta_name)
            shutil.move(fname, dst)

        for template, value in (
            ("name: {}", name),
            ("local name: {}", localname),
            ("fasta: {}", dst),
        ):
            logger.info(template.format(value))

        # Create readme with information
        readme = os.path.join(out_dir, "README.txt")
        asm_acc = self.assembly_accession(name)
        tax_id = self.genome_taxid(name)
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            # missing identifiers are recorded as "na"
            "assembly_accession": asm_acc or "na",
            "tax_id": tax_id or "na",
            "mask": mask,
            "genome url": link,
            "genomepy version": __version__,
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        update_readme(readme, metadata)
Esempio n. 26
0
def clean():
    """Wipe the cached provider data and leave an empty cache directory.

    The cache is the version-specific subdirectory of the genomepy user
    cache directory.
    """
    cache_root = user_cache_dir("genomepy")
    versioned_cache = os.path.join(cache_root, __version__)

    # remove everything that was cached, then recreate the empty directory
    rm_rf(versioned_cache)
    mkdir_p(versioned_cache)
    print("All clean!")
Esempio n. 27
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory
        inside the genome's output directory and only moved into place once
        all steps have finished. A README.txt with metadata is written to
        the same output directory.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not set to True, the setting from the configuration file is used.

        **kwargs
            Passed on to ``get_genome_download_link``.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or bgzip exits with a non-zero status.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # download file in chunks if >75% of the available memory
                # would be used, or if the server did not report a size
                # (a missing Content-Length header is returned as None)
                cutoff = int(int(virtual_memory().available) * 0.75)
                content_length = response.info().get("Content-Length")
                if content_length is not None and int(content_length) < cutoff:
                    chunk_size = None
                else:
                    chunk_size = cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            # (check_call raises CalledProcessError on a non-zero exit status)
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                sp.check_call(["gunzip", "-f", fname])

            def regex_filer(_fname, _regex, _v):
                """Filter the FASTA in place, return the names that were dropped."""
                infa = _fname + "_to_regex"
                outfa = _fname
                os.rename(outfa, infa)
                filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)

                # open the filtered FASTA once, not once per input sequence
                kept = set(Fasta(outfa).keys())
                return [k for k in Fasta(infa).keys() if k not in kept]

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filer(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filer(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested
            if bgzip or config.get("bgzip"):
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        if regex:
            regex_line = f"regex: {regex}"
            if invert_match:
                regex_line += " (inverted match)"
            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)
Esempio n. 28
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory
        inside the genome's output directory and only moved into place once
        all steps have finished. The temporary directory is removed
        afterwards, also when one of the steps fails. A README.txt with
        metadata is written to the same output directory.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not set to True, the setting from the configuration file is used.

        **kwargs
            Passed on to ``get_genome_download_link``.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip exits with a non-zero status.
        Exception
            If bgzip, or the tqdm process tracking its progress, exits with
            a non-zero status.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        def regex_filer(_fname, _regex, _v):
            """Filter the FASTA in place, return the names that were dropped."""
            infa = _fname + "_to_regex"
            os.rename(_fname, infa)
            # filter the fasta and store the output's keys
            keys_out = filter_fasta(infa,
                                    outfa=_fname,
                                    regex=_regex,
                                    v=_v,
                                    force=True).keys()
            keys_in = Fasta(infa).keys()
            return [k for k in keys_in if k not in keys_out]

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        try:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            urlcleanup()
            download_file(link, fname)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            # (check_call raises CalledProcessError on a non-zero exit status)
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                sp.check_call(["gunzip", "-f", fname])

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filer(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filer(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested
            if bgzip or config.get("bgzip"):
                # bgzip to stdout, track progress, and output to file.
                # The processes are chained without a shell, so that paths
                # need no quoting and the exit status of every stage can be
                # checked (a shell pipeline only reports its last command).
                fsize = int(os.path.getsize(fname) * 10**-6)
                bgzip_cmd = ["bgzip", "-fc", fname]
                # the description is passed as separate words (the same
                # argv an unquoted shell command line yields)
                progress_cmd = [
                    "tqdm", "--bytes",
                    "--desc", "Bgzipping", f"{fsize}MB", "fasta",
                    "--log", "ERROR",
                ]
                with open(fname + ".gz", "wb") as gz_out:
                    with sp.Popen(bgzip_cmd, stdout=sp.PIPE) as compress:
                        with sp.Popen(progress_cmd,
                                      stdin=compress.stdout,
                                      stdout=gz_out) as progress:
                            # close our copy of the pipe, so bgzip receives
                            # SIGPIPE if tqdm exits before it is done
                            compress.stdout.close()
                # leaving the Popen contexts waits for both processes
                if compress.returncode != 0 or progress.returncode != 0:
                    raise Exception(
                        f"Error bgzipping {name}. Is tabix installed?")
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)
        finally:
            # never leave the (potentially very large) tmp dir behind
            rm_rf(tmp_dir)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        if not keep_alt or regex:
            regex_line = "regex: "
            if not keep_alt:
                regex_line += "'alt' (inverted match)"

            if not keep_alt and regex:
                regex_line += " and "

            if regex:
                regex_line += f"'{regex}'"
                if invert_match:
                    regex_line += " (inverted match)"

            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)
Esempio n. 29
0
    def download_and_generate_annotation(genomes_dir, annot_url, localname):
        """
        Download annotation file, convert to intermediate file and generate output files

        The annotation is downloaded to a temporary directory inside
        ``genomes_dir/localname`` and converted to an intermediate genePred
        file. From there the GTF and/or BED file that was not downloaded is
        generated. The gzipped GTF and BED files are then moved to
        ``genomes_dir/localname``.

        Parameters
        ----------
        genomes_dir : str
            Directory that holds the genome directories

        annot_url : str
            URL of the annotation file

        localname : str
            Name of the genome directory, also used as prefix of the file names

        Raises
        ------
        TypeError
            If the file extension of the annotation is not recognized.
        subprocess.CalledProcessError
            If one of the external commands exits with a non-zero status.
        """
        from shlex import quote

        def run(template, *paths):
            """Fill the shell command template with quoted paths and run it."""
            sp.check_call(template.format(*map(quote, paths)), shell=True)

        # create output directory if missing
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        # download to tmp dir. Move files on completion.
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            ext, gz = get_file_info(annot_url)
            # All files share this stem and differ in their extension only.
            # Building the names from the stem (instead of replacing `ext`
            # in the full path) leaves directory and genome names intact
            # when they happen to contain the extension.
            annot_stem = os.path.join(tmpdir, localname + ".annotation")
            annot_file = annot_stem + ext
            urlretrieve(annot_url, annot_file)

            # unzip input file (if needed)
            if gz:
                run("mv {0} {1} && gunzip -f {1}",
                    annot_file, annot_file + ".gz")

            # generate intermediate file (GenePred)
            pred_file = annot_stem + ".gp"
            if "bed" in ext:
                cmd = "bedToGenePred {0} {1}"
            elif "gff" in ext:
                cmd = "gff3ToGenePred -geneNameAttr=gene {0} {1}"
            elif "gtf" in ext:
                cmd = "gtfToGenePred {0} {1}"
            elif "txt" in ext:
                # UCSC annotations only
                with open(annot_file) as f:
                    cols = f.readline().split("\t")

                # extract the genePred format columns
                # (the 0-based index of the strand column minus one is the
                # 1-based position of the column two to its left)
                start_col = 1
                for i, col in enumerate(cols):
                    if col in ["+", "-"]:
                        start_col = i - 1
                        break
                end_col = start_col + 10
                cmd = (
                    f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                    # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                    +
                    """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
                )
            else:
                raise TypeError(f"file type extension {ext} not recognized!")

            run(cmd, annot_file, pred_file)

            # generate gzipped gtf file (if required)
            gtf_file = annot_stem + ".gtf"
            if "gtf" not in ext:
                run("genePredToGtf -source=genomepy file {0} {1} && gzip -f {1}",
                    pred_file, gtf_file)

            # generate gzipped bed file (if required)
            bed_file = annot_stem + ".bed"
            if "bed" not in ext:
                run("genePredToBed {0} {1} && gzip -f {1}",
                    pred_file, bed_file)

            # if input file was gtf/bed, gzip it
            if ext in [".gtf", ".bed"]:
                run("gzip -f {}", annot_file)

            # transfer the files from the tmpdir to the genome_dir
            for gz_file in [gtf_file + ".gz", bed_file + ".gz"]:
                src = gz_file
                dst = os.path.join(out_dir, os.path.basename(gz_file))
                shutil.move(src, dst)