Example 1
def _index_sam(env, ref_file):
    (ref_dir, local_file) = os.path.split(ref_file)
    with shared.chdir(ref_dir):
        if not os.path.exists("%s.fai" % local_file):
            subprocess.check_call("samtools faidx %s" % local_file, shell=True)
    galaxy.index_picard(ref_file)
    return ref_file
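Every example on this page leans on a shared.chdir context manager whose implementation is not shown here. A minimal sketch, assuming it simply switches the working directory for the duration of the block and restores it afterwards:

import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    # Switch into new_dir for the block, then restore the previous
    # working directory even if the block raises.
    orig_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(orig_dir)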
Example 2
def _data_uniref(env):
    """Retrieve and index UniRef databases for protein searches.

    http://www.ebi.ac.uk/uniref/

    These are currently indexed for FASTA searches. Are other indexes desired?
    Should this be separated out and organized by program like genome data?
    This should also check the release note and automatically download and
    replace older versions.
    """
    site = "ftp://ftp.uniprot.org"
    base_url = site + "/pub/databases/uniprot/" \
               "current_release/uniref/%s/%s"
    for uniref_db in ["uniref50", "uniref90", "uniref100"]:
        work_dir = os.path.join(env.data_files, "uniref", uniref_db)
        if not os.path.exists(work_dir):
            subprocess.check_call("mkdir -p %s" % work_dir, shell=True)
        base_work_url = base_url % (uniref_db, uniref_db)
        fasta_url = base_work_url + ".fasta.gz"
        base_file = os.path.splitext(os.path.basename(fasta_url))[0]
        with shared.chdir(work_dir):
            if not os.path.exists(base_file):
                out_file = shared._remote_fetch(env, fasta_url)
                subprocess.check_call("gunzip %s" % out_file, shell=True)
                shared._remote_fetch(env, base_work_url + ".release_note")
        _index_blast_db(work_dir, base_file, "prot")
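Several examples also call shared._remote_fetch, which is not shown on this page. A minimal sketch, under the assumption that it downloads a URL into the current directory, returns the local filename, and returns None instead of raising when allow_fail is set:

import os
import urllib.request

def _remote_fetch(env, url, allow_fail=False):
    # Download url into the current directory and return the local filename.
    # env is accepted for parity with the callers but unused in this sketch.
    out_file = os.path.basename(url)
    if not os.path.exists(out_file):
        try:
            urllib.request.urlretrieve(url, out_file)
        except OSError:
            if allow_fail:
                return None
            raise
    return out_file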
Example 3
def download(self, seq_dir):
    zipped_file = None
    genome_file = "%s.fa" % self._name
    if not self._exists(genome_file, seq_dir):
        prep_dir = "seq_prep"
        subprocess.check_call("mkdir -p %s" % prep_dir, shell=True)
        with shared.chdir(prep_dir):
            zipped_file = self._download_zip(seq_dir)
            if zipped_file.endswith(".tar.gz"):
                subprocess.check_call("tar -xzpf %s" % zipped_file, shell=True)
            elif zipped_file.endswith(".zip"):
                subprocess.check_call("unzip %s" % zipped_file, shell=True)
            elif zipped_file.endswith(".gz"):
                if not os.path.exists("out.fa"):
                    subprocess.check_call("gunzip -c %s > out.fa" % zipped_file, shell=True)
            else:
                raise ValueError("Do not know how to handle: %s" % zipped_file)
            tmp_file = genome_file.replace(".fa", ".txt")
            result = subprocess.check_output("find `pwd` -name '*.fa'", shell=True).decode()
            # find terminates its output with a newline, so drop empty entries
            result = [x.strip() for x in result.split("\n") if x.strip()]
            if len(result) == 1:
                orig_result = result[0]
                result = self._split_multifasta(result[0])
                subprocess.check_call("rm %s" % orig_result, shell=True)
            result = self._karyotype_sort(result)
            # concatenate the per-chromosome files in karyotype order
            subprocess.check_call("rm -f inputs.txt", shell=True)
            for fname in result:
                subprocess.check_call("echo '%s' >> inputs.txt" % fname, shell=True)
            subprocess.check_call("cat `cat inputs.txt` > %s" % tmp_file, shell=True)
            for fname in result:
                subprocess.check_call("rm -f %s" % fname, shell=True)
            subprocess.check_call("mv %s %s" % (tmp_file, genome_file), shell=True)
            zipped_file = os.path.join(prep_dir, zipped_file)
            genome_file = os.path.join(prep_dir, genome_file)
    return genome_file, [zipped_file]
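The download method assumes two helpers on the class, _split_multifasta and _karyotype_sort, whose implementations are not included here. A plausible sketch of the sorting step, written as a free function and assuming it orders per-chromosome FASTA files numerically before named chromosomes:

import os
import re

def karyotype_sort(fnames):
    # Numeric chromosomes first in numeric order (chr1, chr2, ..., chr10),
    # then named ones (chrX, chrY, chrM) alphabetically.
    def key(fname):
        base = re.sub(r"\.fa$", "", os.path.basename(fname))
        num = base.replace("chr", "")
        return (0, int(num)) if num.isdigit() else (1, base)
    return sorted(fnames, key=key)

# karyotype_sort(["chr10.fa", "chr2.fa", "chrX.fa", "chr1.fa"])
# -> ["chr1.fa", "chr2.fa", "chr10.fa", "chrX.fa"]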
Example 4
def _data_liftover(env, lift_over_genomes):
    """Download chain files for running liftOver.

    Does not install liftOver binaries automatically.
    """
    lo_dir = os.path.join(env.data_files, "liftOver")
    if not os.path.exists(lo_dir):
        subprocess.check_call("mkdir %s" % lo_dir, shell=True)
    lo_base_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    lo_base_file = "%sTo%s.over.chain.gz"
    for g1 in lift_over_genomes:
        for g2 in [g for g in lift_over_genomes if g != g1]:
            g2u = g2[0].upper() + g2[1:]
            cur_file = lo_base_file % (g1, g2u)
            non_zip = os.path.splitext(cur_file)[0]
            worked = False
            with shared.chdir(lo_dir):
                if not os.path.exists(non_zip):
                    result = shared._remote_fetch(env, "%s" % (lo_base_url % (g1, cur_file)), allow_fail=True)
                    # Chain files don't exist for every pair of builds, so
                    # only move forward if the download succeeded
                    if result:
                        worked = True
                        subprocess.check_call("gunzip %s" % result, shell=True)
            if worked:
                ref_parts = [g1, g2, os.path.join(lo_dir, non_zip)]
                galaxy.update_loc_file(env, "liftOver.loc", ref_parts)
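To make the URL construction concrete, a hypothetical invocation with two UCSC builds:

# For g1="hg19", g2="mm10" the loop requests
#   ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToMm10.over.chain.gz
# and the reverse mm10 -> hg19 chain on the next pass; env is assumed to
# carry data_files as in the other examples.
_data_liftover(env, ["hg19", "mm10"])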
Example 5
def _index_w_command(env, dir_name, command, ref_file, pre=None, post=None, ext=None):
    """Low-level helper: build an index with a templated command and return its path.
    """
    path_export = _get_path_export(env)
    index_name = os.path.splitext(os.path.basename(ref_file))[0]
    if ext is not None:
        index_name += ext
    full_ref_path = os.path.join(os.pardir, ref_file)
    if not os.path.exists(dir_name):
        subprocess.check_call("mkdir %s" % dir_name, shell=True)
        with shared.chdir(dir_name):
            if pre:
                full_ref_path = pre(full_ref_path)
            subprocess.check_call(
                path_export +
                command.format(ref_file=full_ref_path, index_name=index_name),
                shell=True)
            if post:
                post(full_ref_path)
    return os.path.join(dir_name, index_name)
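A plausible caller, to show how the command template is used (this wrapper is an assumption, not taken from this page): {ref_file} and {index_name} are filled in by _index_w_command via str.format.

def _index_bowtie(env, ref_file):
    # Build a Bowtie index in a "bowtie" subdirectory next to the reference.
    cmd = "bowtie-build -f {ref_file} {index_name}"
    return _index_w_command(env, "bowtie", cmd, ref_file)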
Example 6
def _tar_directory(dir, tar_name):
    """Create a tarball of the directory.
    """
    base_dir, tar_dir = os.path.split(dir)
    tarball = os.path.join(base_dir, "%s.tar.xz" % tar_name)
    if not os.path.exists(tarball):
        with shared.chdir(base_dir):
            subprocess.check_call("tar -cvpf - %s | xz -zc - > %s" %
                                  (tar_dir, os.path.basename(tarball)), shell=True)
    return tarball
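The shell pipeline above (tar to stdout, compressed by xz) could equally be written with the standard library. A sketch with the same result, minus the verbose file listing:

import os
import tarfile

def _tar_directory_stdlib(dir, tar_name):
    # tarfile's "w:xz" mode handles the xz compression directly, which
    # avoids shell quoting issues with unusual directory names.
    base_dir, tar_dir = os.path.split(dir)
    tarball = os.path.join(base_dir, "%s.tar.xz" % tar_name)
    if not os.path.exists(tarball):
        with tarfile.open(tarball, "w:xz") as tar:
            tar.add(dir, arcname=tar_dir)
    return tarball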
Example 7
def _index_to_galaxy(env, work_dir, ref_file, gid, genome_indexes, config):
    """Index sequence files and update associated Galaxy loc files.
    """
    indexes = {}
    with shared.chdir(work_dir):
        for idx in genome_indexes:
            index_file = get_index_fn(idx)(env, ref_file)
            if index_file:
                indexes[idx] = os.path.join(work_dir, index_file)
    galaxy.prep_locs(env, gid, indexes, config)
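get_index_fn is not shown on this page. One plausible shape, assuming it maps an index name to the matching indexing function from these examples:

def get_index_fn(index):
    # Hypothetical dispatch table; unknown index names return None here,
    # while the real lookup may behave differently.
    return {"samtools": _index_sam,
            "bwa": _index_bwa,
            "bismark": _index_bismark}.get(index)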
Example 8
def _index_bismark(env, ref_file):
    dir_name = "bismark"
    subprocess.check_call("mkdir -p %s" % dir_name, shell=True)
    with shared.chdir(dir_name):
        local = os.path.basename(ref_file)
        subprocess.check_call("ln -sf {0} {1}".format(ref_file, local),
                              shell=True)
        cmd = f"bismark_genome_preparation ."
        subprocess.check_call(cmd, shell=True)
    return os.path.join(dir_name, "Bisulfite_Genome")
Example 9
def _index_blast_db(work_dir, base_file, db_type):
    """Index a database using blast+ for similary searching.
    """
    type_to_ext = dict(prot=("phr", "pal"), nucl=("nhr", "nal"))
    db_name = os.path.splitext(base_file)[0]
    with shared.chdir(work_dir):
        if not any(os.path.exists("%s.%s" % (db_name, ext))
                   for ext in type_to_ext[db_type]):
            subprocess.check_call("makeblastdb -in %s -dbtype %s -out %s" %
                                  (base_file, db_type, db_name), shell=True)
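How this ties back to the UniRef example above, as a hypothetical call (the path is illustrative):

# Build protein BLAST+ indexes for the uncompressed FASTA; the makeblastdb
# call is skipped when the .phr/.pal outputs already exist.
_index_blast_db("/data/uniref/uniref50", "uniref50.fasta", "prot")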
Example 10
def _index_bwa(env, ref_file):
    dir_name = "bwa"
    local_ref = os.path.split(ref_file)[-1]
    if not os.path.exists(os.path.join(dir_name, "%s.bwt" % local_ref)):
        subprocess.check_call("mkdir -p %s" % dir_name, shell=True)
        with shared.chdir(dir_name):
            subprocess.check_call("ln -sf %s" % os.path.join(os.pardir, ref_file), shell=True)
            try:
                subprocess.check_call("bwa index -a bwtsw %s" % local_ref, shell=True)
            except subprocess.CalledProcessError:
                # "bwtsw" fails for small genomes; retry with the default algorithm
                subprocess.check_call("bwa index %s" % local_ref, shell=True)
            subprocess.check_call("rm -f %s" % local_ref, shell=True)
    return os.path.join(dir_name, local_ref)
Example 11
def finalize(genomes, data_filedir):
    """Provide symlinks back to reference genomes so tophat avoids generating FASTA genomes.
    """
    genome_dir = os.path.join(data_filedir, "genomes")
    for (orgname, gid, manager) in genomes:
        org_dir = os.path.join(genome_dir, orgname)
        for aligner in ["bowtie", "bowtie2"]:
            aligner_dir = os.path.join(org_dir, gid, aligner)
            if os.path.exists(aligner_dir):
                with shared.chdir(aligner_dir):
                    for ext in ["", ".fai"]:
                        orig_seq = os.path.join(os.pardir, "seq", "%s.fa%s" % (gid, ext))
                        if os.path.exists(orig_seq) and not os.path.exists(os.path.basename(orig_seq)):
                            subprocess.check_call("ln -sf %s" % orig_seq, shell=True)
Example 12
def _clean_directory(dir, gid):
    """Clean duplicate files from directories before tar and upload.
    """
    # get rid of softlinks
    bowtie_ln = os.path.join(dir, "bowtie", "%s.fa" % gid)
    maq_ln = os.path.join(dir, "maq", "%s.fa" % gid)
    for to_remove in [bowtie_ln, maq_ln]:
        if os.path.exists(to_remove):
            subprocess.check_call("rm -f %s" % to_remove, shell=True)
    # remove any downloaded original sequence files
    remove_exts = ["*.gz", "*.zip"]
    with shared.chdir(os.path.join(dir, "seq")):
        for rext in remove_exts:
            fnames = subprocess.check_output("find . -name '%s'" % rext, shell=True).decode()
            for fname in (f.strip() for f in fnames.split("\n") if f.strip()):
                subprocess.check_call("rm -f %s" % fname, shell=True)
Example 13
def _data_ngs_genomes(env, genomes, genome_indexes):
    """Download and create index files for next generation genomes.
    """
    genome_dir = _make_genome_dir(env.data_files)
    for organism, genome, manager in genomes:
        cur_dir = os.path.join(genome_dir, organism, genome)
        print("Processing genome {0} and putting it to {1}".format(organism, cur_dir))
        if not os.path.exists(cur_dir):
            subprocess.check_call('mkdir -p %s' % cur_dir, shell=True)
        with shared.chdir(cur_dir):
            if hasattr(env, "remove_old_genomes") and env.remove_old_genomes:
                _clean_genome_directory()
            seq_dir = 'seq'
            ref_file, base_zips = manager.download(seq_dir)
            ref_file = _move_seq_files(ref_file, base_zips, seq_dir)
        cur_indexes = manager.config.get("indexes", genome_indexes)
        _index_to_galaxy(env, cur_dir, ref_file, genome, cur_indexes, manager.config)
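A hypothetical invocation, to show the expected shape of genomes (the manager object is assumed):

# Each entry is an (organism, build, manager) tuple; the manager supplies
# download() plus a per-genome config dict with optional "indexes".
genomes = [("Hsapiens", "hg19", hg19_manager)]  # hg19_manager is assumed
_data_ngs_genomes(env, genomes, ["bwa", "samtools"])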
Example 14
def _download_genomes(env, genomes, genome_indexes):
    """Download a group of genomes from Amazon s3 bucket.
    """
    genome_dir = _make_genome_dir(env.data_files)
    for (orgname, gid, manager) in genomes:
        org_dir = os.path.join(genome_dir, orgname, gid)
        if not os.path.exists(org_dir):
            subprocess.check_call('mkdir -p %s' % org_dir, shell=True)
        for idx in genome_indexes:
            with shared.chdir(org_dir):
                if not os.path.exists(idx):
                    _download_s3_index(env, manager, gid, idx)
        ref_file = os.path.join(org_dir, "seq", "%s.fa" % gid)
        if not os.path.exists(ref_file):
            ref_file = os.path.join(org_dir, "seq", "%s.fa" % manager._name)
        assert os.path.exists(ref_file), ref_file
        cur_indexes = manager.config.get("indexes", genome_indexes)
        _index_to_galaxy(env, org_dir, ref_file, gid, cur_indexes, manager.config)
Example 15
def _prep_genomes(env, genomes, genome_indexes, retrieve_fns, data_filedir):
    """Prepare genomes with the given indexes, supporting multiple retrieval methods.
    """
    genome_dir = _make_genome_dir(data_filedir)
    for (orgname, gid, manager) in genomes:
        org_dir = os.path.join(genome_dir, orgname, gid)
        if not os.path.exists(org_dir):
            subprocess.check_call('mkdir -p %s' % org_dir, shell=True)
        ggd_recipes = manager.config.get("annotations", []) + manager.config.get("validation", [])
        ggd_recipes += [x for x in manager.config.get("indexes", []) if x in genome_indexes]
        for idx in genome_indexes + ggd_recipes:
            with shared.chdir(org_dir):
                if idx in ggd_recipes or not os.path.exists(idx):
                    finished = False
                    last_exc = None
                    for method, retrieve_fn in retrieve_fns:
                        try:
                            retrieve_fn(env, manager, gid, idx)
                            finished = True
                            break
                        except KeyboardInterrupt:
                            raise
                        except BaseException as e:
                            # Fail on incorrect GGD recipes
                            if idx in ggd_recipes and method == "ggd":
                                raise
                            else:
                                last_exc = traceback.format_exc()
                                print("Moving on to next genome prep method after trying {0}\n{1}".format(
                                      method, str(e)))
                    if not finished:
                        raise IOError("Could not prepare index {0} for {1} by any method\n{2}"
                                      .format(idx, gid, last_exc))
        ref_file = os.path.join(org_dir, "seq", "%s.fa" % gid)
        if not os.path.exists(ref_file):
            ref_file = os.path.join(org_dir, "seq", "%s.fa" % manager._name)
        assert os.path.exists(ref_file), ref_file
        _index_to_galaxy(env, org_dir, ref_file, gid, genome_indexes, manager.config)
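A hypothetical retrieval chain, to show the expected shape of retrieve_fns (the GGD installer name is an assumption; _download_s3_index appears in the example above):

# Each entry pairs a method name with a function taking (env, manager, gid, idx).
# Failures from the "ggd" method on GGD recipes are fatal; anything else
# falls through to the next method in the list.
retrieve_fns = [("ggd", _retrieve_ggd), ("s3", _download_s3_index)]
_prep_genomes(env, genomes, ["bwa"], retrieve_fns, env.data_files)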
Example 16
def update_loc_file(env, ref_file, line_parts):
    """Add a reference to the given genome to the base index file.
    """
    if getattr(env, "galaxy_home", None) is not None:
        tools_dir = os.path.join(env.galaxy_home, "tool-data")
        if not os.path.exists(tools_dir):
            subprocess.check_call("mkdir -p %s" % tools_dir, shell=True)
        dt_file = os.path.join(env.galaxy_home, "tool_data_table_conf.xml")
        if not os.path.exists(dt_file):
            shutil.copy(env.tool_data_table_conf_file, dt_file)
        add_str = "\t".join(line_parts)
        with shared.chdir(tools_dir):
            if not os.path.exists(ref_file):
                subprocess.check_call("touch %s" % ref_file, shell=True)
            has_line = False
            with open(ref_file) as in_handle:
                for line in in_handle:
                    if line.strip() == add_str.strip():
                        has_line = True
            if not has_line:
                with open(ref_file, "a") as out_handle:
                    out_handle.write(add_str + "\n")
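A hypothetical call that matches the liftOver example above (the chain path is illustrative):

# Appends the tab-separated line
#   hg19<TAB>mm10<TAB>/data/liftOver/hg19ToMm10.over.chain
# to tool-data/liftOver.loc under galaxy_home, unless it is already present.
update_loc_file(env, "liftOver.loc",
                ["hg19", "mm10", "/data/liftOver/hg19ToMm10.over.chain"])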