def download_pfam_descriptions(output_dir='.', verbose=True):
    """Fetch the gzipped Pfam-A HMM metadata (.dat) file from the EBI FTP site.

    Parameters
    ----------
    output_dir : str
        Directory the archive is written to.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the downloaded Pfam-A.hmm.dat.gz archive.
    """
    destination = path.join(output_dir, 'Pfam-A.hmm.dat.gz')
    download_file(
        'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz',
        destination, verbose=verbose)
    return destination
def download_and_process_viral_refseq(merged_viral_faas=None, output_dir='.', viral_files=2,
                                      threads=10, verbose=True):
    """Can only download newest version"""
    # RefSeq viral proteins are split across numbered files on the FTP site, so
    # the file count must currently be supplied by the caller.
    # TODO: Make it so that you don't need to know number of viral files in refseq viral
    if merged_viral_faas is None:
        # download database if not provided
        faa_base_name = 'viral.%s.protein.faa.gz'
        viral_faa_glob = path.join(output_dir, faa_base_name % '*')
        # the FTP files are numbered starting at 1
        for file_number in range(1, viral_files + 1):
            refseq_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.%s.protein.faa.gz' % file_number
            refseq_faa = path.join(output_dir, faa_base_name % file_number)
            download_file(refseq_url, refseq_faa, verbose=verbose)
        # concatenate the per-file downloads into one merged archive
        merged_viral_faas = path.join(output_dir, 'viral.merged.protein.faa.gz')
        run_process(['cat %s > %s' % (' '.join(glob(viral_faa_glob)), merged_viral_faas)],
                    shell=True)
    # build a date-stamped, indexed mmseqs database from the merged fasta
    refseq_viral_mmseqs_db = path.join(output_dir, 'refseq_viral.%s.mmsdb' % get_iso_date())
    make_mmseqs_db(merged_viral_faas, refseq_viral_mmseqs_db, create_index=True, threads=threads,
                   verbose=verbose)
    return refseq_viral_mmseqs_db
def download_vog_annotations(output_dir, vogdb_version='latest', verbose=True):
    """Download the VOG annotations table for the given VOGDB release.

    Parameters
    ----------
    output_dir : str
        Directory the table is written to.
    vogdb_version : str
        VOGDB release identifier (default 'latest'); used in both the URL and
        the local file name.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the downloaded annotations file.
    """
    annotations_url = ('http://fileshare.csb.univie.ac.at/vog/%s/vog.annotations.tsv.gz'
                       % vogdb_version)
    vog_annotations = path.join(output_dir, 'vog_annotations_%s.tsv.gz' % vogdb_version)
    download_file(annotations_url, vog_annotations, verbose=verbose)
    return vog_annotations
def download_and_process_amg_database(output_dir, branch='master', verbose=True):
    """Download the DRAM AMG database TSV from GitHub.

    Parameters
    ----------
    output_dir : str
        Directory the file is written to.
    branch : str
        Git branch of the shafferm/DRAM repository to pull from.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the date-stamped amg_database TSV.
    """
    source_url = ('https://raw.githubusercontent.com/shafferm/DRAM/%s/data/amg_database.tsv'
                  % branch)
    amg_database = path.join(output_dir, 'amg_database.%s.tsv' % get_iso_date())
    download_file(source_url, amg_database, verbose=verbose)
    return amg_database
def download_and_process_function_heatmap_form(output_dir, branch='master', verbose=True):
    """Download the DRAM function heatmap form TSV from GitHub.

    Parameters
    ----------
    output_dir : str
        Directory the file is written to.
    branch : str
        Git branch of the shafferm/DRAM repository to pull from.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the date-stamped function_heatmap_form TSV.
    """
    source_url = ('https://raw.githubusercontent.com/shafferm/DRAM/%s/data/function_heatmap_form.tsv'
                  % branch)
    function_heatmap_form = path.join(output_dir,
                                      'function_heatmap_form.%s.tsv' % get_iso_date())
    download_file(source_url, function_heatmap_form, verbose=verbose)
    return function_heatmap_form
def download_and_process_genome_summary_form(output_dir, branch='master', verbose=True):
    """Download the DRAM genome summary form TSV from GitHub.

    Parameters
    ----------
    output_dir : str
        Directory the file is written to.
    branch : str
        Git branch of the shafferm/DRAM repository to pull from.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the date-stamped genome_summary_form TSV.
    """
    source_url = ('https://raw.githubusercontent.com/shafferm/DRAM/%s/data/genome_summary_form.tsv'
                  % branch)
    genome_summary_form = path.join(output_dir,
                                    'genome_summary_form.%s.tsv' % get_iso_date())
    download_file(source_url, genome_summary_form, verbose=verbose)
    return genome_summary_form
def download_dbcan_descriptions(output_dir='.', upload_date='07302020', verbose=True):
    """Download the dbCAN CAZy family activity descriptions.

    Parameters
    ----------
    output_dir : str
        Directory the file is written to.
    upload_date : str
        Date stamp (MMDDYYYY) identifying the dbCAN upload; used in both the
        URL and the local file name.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Local path of the fam-activities text file.
    """
    source_url = ('http://bcb.unl.edu/dbCAN2/download/Databases/CAZyDB.%s.fam-activities.txt'
                  % upload_date)
    dbcan_fam_activities = path.join(output_dir,
                                     'CAZyDB.%s.fam-activities.txt' % upload_date)
    download_file(source_url, dbcan_fam_activities, verbose=verbose)
    return dbcan_fam_activities
def download_and_process_kofam_ko_list(kofam_ko_list_gz=None, output_dir='.', verbose=False):
    """Download (if needed) and decompress the KOfam KO list.

    Parameters
    ----------
    kofam_ko_list_gz : str, optional
        Path to a pre-downloaded ko_list.gz archive; when None the archive is
        fetched from the GenomeNet FTP site into output_dir.
    output_dir : str
        Directory that receives the downloaded archive and the decompressed
        TSV.
    verbose : bool
        Passed through to download_file.

    Returns
    -------
    str
        Path to the decompressed kofam_ko_list.tsv inside output_dir.
    """
    import gzip
    import shutil
    if kofam_ko_list_gz is None:
        kofam_ko_list_gz = path.join(output_dir, 'kofam_ko_list.tsv.gz')
        download_file('ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz', kofam_ko_list_gz,
                      verbose=verbose)
    # Decompress to an explicit path in output_dir: the old `gunzip` call wrote
    # next to the archive and merely stripped the .gz suffix, so a
    # user-supplied archive at another path (or named ko_list.gz) left the
    # returned path nonexistent (this was the TODO). The source archive is
    # kept rather than deleted.
    kofam_ko_list = path.join(output_dir, 'kofam_ko_list.tsv')
    with gzip.open(kofam_ko_list_gz, 'rb') as compressed, open(kofam_ko_list, 'wb') as decompressed:
        shutil.copyfileobj(compressed, decompressed)
    return kofam_ko_list
def download_and_process_pfam(pfam_full_zipped=None, output_dir='.', threads=10, verbose=True):
    """Download (if needed) the full Pfam-A alignment and build an mmseqs profile.

    Parameters
    ----------
    pfam_full_zipped : str, optional
        Pre-downloaded Pfam-A.full.gz; fetched from EBI into output_dir when
        None.
    output_dir : str
        Directory used for the download and the processed profile.
    threads : int
        Thread count handed to process_mmspro.
    verbose : bool
        Passed through to the download and processing helpers.

    Returns
    -------
    Whatever process_mmspro returns for the built 'pfam' profile.
    """
    if pfam_full_zipped is None:
        # download database if not provided
        pfam_full_zipped = path.join(output_dir, 'Pfam-A.full.gz')
        download_file(
            'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz',
            pfam_full_zipped, verbose=verbose)
    return process_mmspro(pfam_full_zipped, output_dir, 'pfam', threads, verbose)
def download_and_process_dbcan(dbcan_hmm=None, output_dir='.', dbcan_release='8', verbose=True):
    """Download (if needed) the dbCAN HMM database and press it with hmmpress.

    Parameters
    ----------
    dbcan_hmm : str, optional
        Pre-downloaded dbCAN HMM text file; fetched into output_dir when None.
    output_dir : str
        Directory the HMM file is downloaded to.
    dbcan_release : str
        dbCAN release number used in the URL and file name.
    verbose : bool
        Passed through to the download and process helpers.

    Returns
    -------
    str
        Path to the (now pressed) dbCAN HMM file.
    """
    if dbcan_hmm is None:
        # download database if not provided
        dbcan_hmm = path.join(output_dir, 'dbCAN-HMMdb-V%s.txt' % dbcan_release)
        dbcan_url = ('http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-HMMdb-V%s.txt'
                     % dbcan_release)
        download_file(dbcan_url, dbcan_hmm, verbose=verbose)
    # -f forces regeneration of existing pressed index files
    run_process(['hmmpress', '-f', dbcan_hmm], verbose=verbose)
    return dbcan_hmm
def download_and_process_vogdb(vog_hmm_targz=None, output_dir='.', vogdb_release='latest',
                               verbose=True):
    """Download (if needed) the VOGDB HMM tarball, extract, merge and press it.

    Parameters
    ----------
    vog_hmm_targz : str, optional
        Pre-downloaded vog.hmm.tar.gz; fetched from univie.ac.at when None.
    output_dir : str
        Directory that receives the archive, the extracted per-VOG HMMs
        (under vogdb_hmms/) and the merged HMM file.
    vogdb_release : str
        VOGDB release identifier (default 'latest').
    verbose : bool
        Passed through to the download and process helpers.

    Returns
    -------
    str
        Path to the merged, hmmpress-ed HMM file.
    """
    if vog_hmm_targz is None:
        vog_hmm_targz = path.join(output_dir, 'vog.hmm.tar.gz')
        vogdb_url = 'http://fileshare.csb.univie.ac.at/vog/%s/vog.hmm.tar.gz' % vogdb_release
        download_file(vogdb_url, vog_hmm_targz, verbose=verbose)
    hmm_dir = path.join(output_dir, 'vogdb_hmms')
    mkdir(hmm_dir)
    # context manager fixes a file-handle leak: the original opened the
    # tarball with tarfile.open and never closed it
    with tarfile.open(vog_hmm_targz) as vogdb_targz:
        vogdb_targz.extractall(hmm_dir)
    vog_hmms = path.join(output_dir, 'vog_%s_hmms.txt' % vogdb_release)
    merge_files(glob(path.join(hmm_dir, 'VOG*.hmm')), vog_hmms)
    # -f forces regeneration of existing pressed index files
    run_process(['hmmpress', '-f', vog_hmms], verbose=verbose)
    return vog_hmms
def download_and_process_merops_peptidases(peptidase_faa=None, output_dir='.', threads=10,
                                           verbose=True):
    """Download (if needed) MEROPS peptidases and build an mmseqs database.

    Parameters
    ----------
    peptidase_faa : str, optional
        Pre-downloaded peptidase fasta; the MEROPS pepunit library is fetched
        into output_dir when None.
    output_dir : str
        Directory used for the download and the mmseqs database.
    threads : int
        Thread count handed to make_mmseqs_db.
    verbose : bool
        Passed through to the download and database helpers.

    Returns
    -------
    str
        Path to the date-stamped, indexed mmseqs peptidase database.
    """
    if peptidase_faa is None:
        # download database if not provided
        peptidase_faa = path.join(output_dir, 'merops_peptidases_nr.faa')
        merops_url = 'ftp://ftp.ebi.ac.uk/pub/databases/merops/current_release/pepunit.lib'
        download_file(merops_url, peptidase_faa, verbose=verbose)
    peptidase_mmseqs_db = path.join(output_dir, 'peptidases.%s.mmsdb' % get_iso_date())
    make_mmseqs_db(peptidase_faa, peptidase_mmseqs_db, create_index=True, threads=threads,
                   verbose=verbose)
    return peptidase_mmseqs_db
def download_and_process_kofam_hmms(kofam_profile_tar_gz=None, output_dir='.', verbose=False):
    """Download (if needed), extract, merge and press the KOfam HMM profiles.

    Parameters
    ----------
    kofam_profile_tar_gz : str, optional
        Pre-downloaded profiles.tar.gz; fetched from the GenomeNet FTP site
        into output_dir when None.
    output_dir : str
        Directory that receives the archive, the extracted profiles
        (under kofam_profiles/) and the merged HMM file.
    verbose : bool
        Passed through to the download and process helpers.

    Returns
    -------
    str
        Path to the merged, hmmpress-ed KOfam HMM file.
    """
    if kofam_profile_tar_gz is None:
        kofam_profile_tar_gz = path.join(output_dir, 'kofam_profiles.tar.gz')
        download_file('ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz',
                      kofam_profile_tar_gz, verbose=verbose)
    extraction_dir = path.join(output_dir, 'kofam_profiles')
    mkdir(extraction_dir)
    run_process(['tar', '-xzf', kofam_profile_tar_gz, '-C', extraction_dir], verbose=verbose)
    # the tarball unpacks into a 'profiles' subdirectory of per-KO .hmm files
    merged_kofam_profiles = path.join(output_dir, 'kofam_profiles.hmm')
    merge_files(glob(path.join(extraction_dir, 'profiles', '*.hmm')), merged_kofam_profiles)
    # -f forces regeneration of existing pressed index files
    run_process(['hmmpress', '-f', merged_kofam_profiles], verbose=verbose)
    return merged_kofam_profiles
def download_and_process_uniref(uniref_fasta_zipped=None, output_dir='.', uniref_version='90',
                                threads=10, verbose=True):
    """Download (if needed) a UniRef release and build an mmseqs database.

    Parameters
    ----------
    uniref_fasta_zipped : str, optional
        Pre-downloaded gzipped UniRef fasta; fetched from the UniProt FTP
        site into output_dir when None.
    output_dir : str
        Directory used for the download and the mmseqs database.
    uniref_version : str
        UniRef clustering level ('50', '90' or '100'); used in the URL and
        the database name.
    threads : int
        Thread count handed to make_mmseqs_db.
    verbose : bool
        Passed through to the download and database helpers.

    Returns
    -------
    str
        Path to the date-stamped, indexed mmseqs UniRef database.
    """
    if uniref_fasta_zipped is None:
        # download database if not provided
        uniref_fasta_zipped = path.join(output_dir, 'uniref%s.fasta.gz' % uniref_version)
        uniref_url = ('ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref%s/uniref%s.fasta.gz'
                      % (uniref_version, uniref_version))
        download_file(uniref_url, uniref_fasta_zipped, verbose=verbose)
    uniref_mmseqs_db = path.join(output_dir,
                                 'uniref%s.%s.mmsdb' % (uniref_version, get_iso_date()))
    make_mmseqs_db(uniref_fasta_zipped, uniref_mmseqs_db, create_index=True, threads=threads,
                   verbose=verbose)
    return uniref_mmseqs_db