def process_kegg_db(output_dir, kegg_loc, gene_ko_link_loc=None, download_date=None, threads=10, verbose=True):
    """Build an MMseqs2 database from a KEGG protein fasta.

    If a gene->KO link file is supplied, KO identifiers are first appended to
    fasta headers that lack them; otherwise the fasta is used unmodified.
    Returns the path to the created mmseqs database.
    """
    if download_date is None:
        download_date = get_iso_date()
    # Decide which fasta feeds the database: the original, or one with KOs
    # appended to headers that are missing them.
    fasta_for_db = kegg_loc
    if gene_ko_link_loc is not None:
        fasta_for_db = path.join(output_dir, 'kegg.mod.fa')
        modified_records = generate_modified_kegg_fasta(kegg_loc, gene_ko_link_loc)
        write_sequence(modified_records, format='fasta', into=fasta_for_db)
    # Build the (indexed) mmseqs database from the chosen fasta.
    kegg_mmseqs_db = path.join(output_dir, 'kegg.%s.mmsdb' % download_date)
    make_mmseqs_db(fasta_for_db, kegg_mmseqs_db, create_index=True,
                   threads=threads, verbose=verbose)
    return kegg_mmseqs_db
def download_and_process_viral_refseq(merged_viral_faas=None, output_dir='.', viral_files=2, threads=10, verbose=True):
    """Download RefSeq viral proteins and build an MMseqs2 database from them.

    Can only download the newest release. RefSeq splits the viral proteins
    across ``viral_files`` numbered gzip files, which are downloaded, merged
    (gzip members concatenate cleanly), and turned into an indexed mmseqs db.
    If ``merged_viral_faas`` is given, the download/merge step is skipped.
    Returns the path to the created mmseqs database.

    TODO: Make it so that you don't need to know the number of viral files
    in refseq viral.
    """
    if merged_viral_faas is None:  # download database if not provided
        faa_base_name = 'viral.%s.protein.faa.gz'
        viral_faa_glob = path.join(output_dir, faa_base_name % '*')
        # RefSeq numbers its files starting at 1 — iterate 1..viral_files
        # directly instead of mutating the loop variable.
        for number in range(1, viral_files + 1):
            refseq_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.%s.protein.faa.gz' % number
            refseq_faa = path.join(output_dir, faa_base_name % number)
            download_file(refseq_url, refseq_faa, verbose=verbose)
        # then merge files from above
        merged_viral_faas = path.join(output_dir, 'viral.merged.protein.faa.gz')
        # NOTE(review): shell=True with an interpolated path is shell-injection
        # prone if output_dir is ever untrusted; paths here are program-built,
        # but consider a Python-level file concatenation instead.
        run_process([
            'cat %s > %s' % (' '.join(glob(viral_faa_glob)), merged_viral_faas)
        ], shell=True)
    # make mmseqs database
    refseq_viral_mmseqs_db = path.join(
        output_dir, 'refseq_viral.%s.mmsdb' % get_iso_date())
    make_mmseqs_db(merged_viral_faas, refseq_viral_mmseqs_db, create_index=True,
                   threads=threads, verbose=verbose)
    return refseq_viral_mmseqs_db
def download_and_process_merops_peptidases(peptidase_faa=None, output_dir='.', threads=10, verbose=True):
    """Fetch the MEROPS peptidase library and build an MMseqs2 database.

    Downloads the current-release pepunit library from EBI unless a local
    fasta is supplied via ``peptidase_faa``. Returns the path to the created
    mmseqs database.
    """
    if peptidase_faa is None:
        # No local copy provided — pull the current release from EBI.
        merops_url = 'ftp://ftp.ebi.ac.uk/pub/databases/merops/current_release/pepunit.lib'
        peptidase_faa = path.join(output_dir, 'merops_peptidases_nr.faa')
        download_file(merops_url, peptidase_faa, verbose=verbose)
    db_name = 'peptidases.%s.mmsdb' % get_iso_date()
    peptidase_mmseqs_db = path.join(output_dir, db_name)
    make_mmseqs_db(peptidase_faa, peptidase_mmseqs_db, create_index=True,
                   threads=threads, verbose=verbose)
    return peptidase_mmseqs_db
def download_and_process_uniref(uniref_fasta_zipped=None, output_dir='.', uniref_version='90', threads=10, verbose=True):
    """Fetch a UniRef release and build an MMseqs2 database from it.

    Downloads the gzipped UniRef fasta for ``uniref_version`` (e.g. '90')
    unless ``uniref_fasta_zipped`` points at an existing local copy.
    Returns the path to the created mmseqs database.
    """
    if uniref_fasta_zipped is None:
        # No local copy provided — download from UniProt's FTP site.
        uniref_url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref%s/uniref%s.fasta.gz' % \
                     (uniref_version, uniref_version)
        uniref_fasta_zipped = path.join(output_dir, 'uniref%s.fasta.gz' % uniref_version)
        download_file(uniref_url, uniref_fasta_zipped, verbose=verbose)
    db_name = 'uniref%s.%s.mmsdb' % (uniref_version, get_iso_date())
    uniref_mmseqs_db = path.join(output_dir, db_name)
    make_mmseqs_db(uniref_fasta_zipped, uniref_mmseqs_db, create_index=True,
                   threads=threads, verbose=verbose)
    return uniref_mmseqs_db
def test_make_mmseqs_db(mmseqs_db_dir):
    """make_mmseqs_db should produce the requested database file on disk."""
    input_faa = os.path.join('tests', 'data', 'NC_001422.faa')
    db_path = str(mmseqs_db_dir.join('mmseqs_db.mmsdb'))
    make_mmseqs_db(input_faa, db_path, True, 1)
    # The primary mmsdb file must exist after the call.
    assert os.path.isfile(db_path)
def target_mmseqs_db(mmseqs_db_dir, phix_proteins):
    """Build a target mmseqs database from the phiX proteins and return its path."""
    db_path = str(mmseqs_db_dir.join('target.mmsdb'))
    make_mmseqs_db(phix_proteins, db_path, True, 1)
    return db_path
def mmseqs_db(prodigal_faa, mmseqs_db_dir):
    """Build an mmseqs database from the prodigal faa and return its path."""
    db_path = str(mmseqs_db_dir.join('mmseqs_db.mmsdb'))
    make_mmseqs_db(prodigal_faa, db_path, True, 1)
    return db_path