def download_and_process_viral_refseq(merged_viral_faas=None, output_dir='.', viral_files=2, threads=10,
                                      verbose=True):
    """Build an mmseqs database from RefSeq viral proteins.

    Can only download the newest RefSeq release.

    :param merged_viral_faas: path to an already merged, gzipped viral protein
        fasta; when None the per-part files are downloaded and merged here
    :param output_dir: directory that receives downloads and the database
    :param viral_files: number of ``viral.N.protein.faa.gz`` parts to fetch
        (parts are numbered starting at 1)
    :param threads: forwarded to make_mmseqs_db
    :param verbose: forwarded to download_file and make_mmseqs_db
    :return: path of the mmseqs database, named with the current ISO date
    """
    # TODO: Make it so that you don't need to know number of viral files in refseq viral
    if merged_viral_faas is None:  # download database if not provided
        import shutil

        faa_base_name = 'viral.%s.protein.faa.gz'
        downloaded_faas = []
        for number in range(1, viral_files + 1):
            refseq_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.%s.protein.faa.gz' % number
            refseq_faa = path.join(output_dir, faa_base_name % number)
            download_file(refseq_url, refseq_faa, verbose=verbose)
            downloaded_faas.append(refseq_faa)

        # Merge only the parts fetched above. Globbing 'viral.*.protein.faa.gz'
        # would also match the merged output itself and any stale parts left in
        # output_dir by an earlier run.
        merged_viral_faas = path.join(output_dir, 'viral.merged.protein.faa.gz')
        # Back-to-back gzip members form a valid gzip stream, so a plain binary
        # concatenation is enough; no shell is needed, which also keeps paths
        # containing spaces or shell metacharacters working.
        with open(merged_viral_faas, 'wb') as merged_handle:
            for refseq_faa in downloaded_faas:
                with open(refseq_faa, 'rb') as part_handle:
                    shutil.copyfileobj(part_handle, merged_handle)

    # make mmseqs database
    refseq_viral_mmseqs_db = path.join(output_dir, 'refseq_viral.%s.mmsdb' % get_iso_date())
    make_mmseqs_db(merged_viral_faas, refseq_viral_mmseqs_db, create_index=True, threads=threads,
                   verbose=verbose)
    return refseq_viral_mmseqs_db
def download_and_process_kofam_ko_list(kofam_ko_list_gz=None, output_dir='.', verbose=False):
    """Produce the uncompressed KOfam ko_list table inside output_dir.

    :param kofam_ko_list_gz: path to a gzipped ko_list; when None the file is
        downloaded into output_dir first
    :param output_dir: directory that receives ``kofam_ko_list.tsv``
    :param verbose: forwarded to download_file
    :return: path of the decompressed table, always
        ``<output_dir>/kofam_ko_list.tsv``
    """
    import gzip
    import shutil
    from os import remove

    downloaded_here = kofam_ko_list_gz is None
    if downloaded_here:
        kofam_ko_list_gz = path.join(output_dir, 'kofam_ko_list.tsv.gz')
        download_file('ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz', kofam_ko_list_gz, verbose=verbose)

    # Decompress straight to the path we return. Shelling out to `gunzip` wrote
    # the result next to the archive, so an archive supplied from another
    # directory (or under another name) left the returned path non-existent.
    kofam_ko_list = path.join(output_dir, 'kofam_ko_list.tsv')
    with gzip.open(kofam_ko_list_gz, 'rb') as compressed, open(kofam_ko_list, 'wb') as decompressed:
        shutil.copyfileobj(compressed, decompressed)

    # Only discard the archive when this function fetched it; a caller-supplied
    # file is left untouched.
    if downloaded_here:
        remove(kofam_ko_list_gz)
    return kofam_ko_list
def download_and_process_dbcan(dbcan_hmm=None, output_dir='.', dbcan_release='8', verbose=True):
    """Fetch the dbCAN HMM file when none is supplied, then run hmmpress on it.

    :param dbcan_hmm: path to an existing dbCAN HMM file; when None the file
        for ``dbcan_release`` is downloaded into output_dir
    :param output_dir: directory used for the download
    :param dbcan_release: release number substituted into the file name and URL
    :param verbose: forwarded to download_file and run_process
    :return: path of the HMM file that was passed to hmmpress
    """
    needs_download = dbcan_hmm is None
    if needs_download:
        hmm_file_name = 'dbCAN-HMMdb-V%s.txt' % dbcan_release
        dbcan_hmm = path.join(output_dir, hmm_file_name)
        source_url = 'http://bcb.unl.edu/dbCAN2/download/Databases/dbCAN-HMMdb-V%s.txt' % dbcan_release
        download_file(source_url, dbcan_hmm, verbose=verbose)

    # -f makes hmmpress overwrite index files left by a previous run
    hmmpress_command = ['hmmpress', '-f', dbcan_hmm]
    run_process(hmmpress_command, verbose=verbose)
    return dbcan_hmm
def download_and_process_vogdb(vog_hmm_targz=None, output_dir='.', vogdb_release='latest', verbose=True):
    """Unpack the VOGDB HMM archive, merge the profiles into one file and hmmpress it.

    :param vog_hmm_targz: path to an existing ``vog.hmm.tar.gz``; when None the
        archive for ``vogdb_release`` is downloaded into output_dir
    :param output_dir: directory that receives the archive, the extracted
        ``vogdb_hmms`` directory and the merged HMM file
    :param vogdb_release: release name substituted into the URL and the merged
        file name
    :param verbose: forwarded to download_file and run_process
    :return: path of the merged, hmmpressed HMM file
    """
    if vog_hmm_targz is None:
        vog_hmm_targz = path.join(output_dir, 'vog.hmm.tar.gz')
        vogdb_url = 'http://fileshare.csb.univie.ac.at/vog/%s/vog.hmm.tar.gz' % vogdb_release
        download_file(vogdb_url, vog_hmm_targz, verbose=verbose)

    hmm_dir = path.join(output_dir, 'vogdb_hmms')
    mkdir(hmm_dir)
    # Context manager so the archive handle is closed even if extraction fails.
    # NOTE(review): extractall trusts member paths inside the archive; consider
    # an extraction filter if the archive can come from an untrusted source.
    with tarfile.open(vog_hmm_targz) as vogdb_targz:
        vogdb_targz.extractall(hmm_dir)

    vog_hmms = path.join(output_dir, 'vog_%s_hmms.txt' % vogdb_release)
    # glob order is filesystem dependent; sort so the merged file is reproducible
    merge_files(sorted(glob(path.join(hmm_dir, 'VOG*.hmm'))), vog_hmms)
    run_process(['hmmpress', '-f', vog_hmms], verbose=verbose)
    return vog_hmms
def download_and_process_kofam_hmms(kofam_profile_tar_gz=None, output_dir='.', verbose=False):
    """Unpack the KOfam profile archive, merge the profiles and hmmpress the result.

    :param kofam_profile_tar_gz: path to an existing profiles archive; when
        None it is downloaded into output_dir
    :param output_dir: directory that receives the archive, the extracted
        ``kofam_profiles`` directory and the merged ``kofam_profiles.hmm``
    :param verbose: forwarded to download_file and run_process
    :return: path of the merged, hmmpressed HMM file
    """
    if kofam_profile_tar_gz is None:
        kofam_profile_tar_gz = path.join(output_dir, 'kofam_profiles.tar.gz')
        download_file('ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz', kofam_profile_tar_gz, verbose=verbose)

    # extract the archive into its own directory
    extraction_dir = path.join(output_dir, 'kofam_profiles')
    mkdir(extraction_dir)
    untar_command = ['tar', '-xzf', kofam_profile_tar_gz, '-C', extraction_dir]
    run_process(untar_command, verbose=verbose)

    # the individual profiles are looked up in a 'profiles' subdirectory
    combined_hmm = path.join(output_dir, 'kofam_profiles.hmm')
    hmm_pattern = path.join(extraction_dir, 'profiles', '*.hmm')
    merge_files(glob(hmm_pattern), combined_hmm)

    press_command = ['hmmpress', '-f', combined_hmm]
    run_process(press_command, verbose=verbose)
    return combined_hmm
def process_mmspro(full_alignment, output_dir, db_name='db', threads=10, verbose=True):
    """Turn a multiple sequence alignment into an indexed mmseqs profile database.

    Runs ``mmseqs convertmsa``, ``mmseqs msa2profile`` and
    ``mmseqs createindex`` in that order.

    :param full_alignment: alignment file handed to ``mmseqs convertmsa``
    :param output_dir: directory that receives all generated files
    :param db_name: base name used for the ``.mmsmsa`` and ``.mmspro`` files
    :param threads: thread count passed to msa2profile and createindex
    :param verbose: forwarded to run_process
    :return: path of the mmseqs profile database
    """
    msa_db = path.join(output_dir, '%s.mmsmsa' % db_name)
    convert_command = ['mmseqs', 'convertmsa', full_alignment, msa_db]
    run_process(convert_command, verbose=verbose)

    profile_db = path.join(output_dir, '%s.mmspro' % db_name)
    profile_command = ['mmseqs', 'msa2profile', msa_db, profile_db,
                       '--match-mode', '1',
                       '--threads', str(threads)]
    run_process(profile_command, verbose=verbose)

    # createindex takes a scratch directory as its second argument
    scratch_dir = path.join(output_dir, 'tmp')
    index_command = ['mmseqs', 'createindex', profile_db, scratch_dir,
                     '-k', '5',
                     '-s', '7',
                     '--threads', str(threads)]
    run_process(index_command, verbose=verbose)
    return profile_db
def test_run_process():
    """Smoke test: run_process is called on a trivial echo command."""
    echo_command = ['echo', 'Hello', 'World']
    run_process(echo_command, verbose=True)
    # Reaching this line means the call above did not raise.
    assert True