Esempio n. 1
0
def process_kegg_db(output_dir,
                    kegg_loc,
                    gene_ko_link_loc=None,
                    download_date=None,
                    threads=10,
                    verbose=True):
    """Build an indexed MMseqs2 database from a KEGG fasta file.

    Args:
        output_dir: Directory that receives the (optional) modified fasta and
            the resulting database.
        kegg_loc: Path to the KEGG fasta file.
        gene_ko_link_loc: Optional gene-to-KO link file. When given, a
            modified fasta (``kegg.mod.fa``) is written to ``output_dir`` with
            KOs added to the end of headers that do not already carry one,
            and the database is built from that file instead of ``kegg_loc``.
        download_date: Date string embedded in the database file name;
            defaults to the value returned by ``get_iso_date()``.
        threads: Thread count forwarded to ``make_mmseqs_db``.
        verbose: Verbosity flag forwarded to ``make_mmseqs_db``.

    Returns:
        Path of the created database, ``<output_dir>/kegg.<date>.mmsdb``.
    """
    if download_date is None:
        download_date = get_iso_date()

    # Without a link file the input fasta is used untouched.
    fasta_for_db = kegg_loc
    if gene_ko_link_loc is not None:
        # Add KOs to the end of each header where a KO is not already there.
        fasta_for_db = path.join(output_dir, 'kegg.mod.fa')
        modified_records = generate_modified_kegg_fasta(kegg_loc,
                                                        gene_ko_link_loc)
        write_sequence(modified_records, format='fasta', into=fasta_for_db)

    # Make the mmseqs database from whichever fasta was selected above.
    db_file_name = 'kegg.%s.mmsdb' % download_date
    kegg_mmseqs_db = path.join(output_dir, db_file_name)
    make_mmseqs_db(fasta_for_db,
                   kegg_mmseqs_db,
                   create_index=True,
                   threads=threads,
                   verbose=verbose)
    return kegg_mmseqs_db
Esempio n. 2
0
def download_and_process_viral_refseq(merged_viral_faas=None,
                                      output_dir='.',
                                      viral_files=2,
                                      threads=10,
                                      verbose=True):
    """Download RefSeq viral proteins (if needed) and build an MMseqs2 database.

    Can only download the newest version.

    Args:
        merged_viral_faas: Path to an already merged viral protein fasta. When
            ``None``, the numbered ``viral.<n>.protein.faa.gz`` files are
            downloaded into ``output_dir`` and merged into
            ``viral.merged.protein.faa.gz``.
        output_dir: Directory that receives downloads, the merged fasta and
            the resulting database.
        viral_files: Number of numbered protein files to download (1-based,
            ``1..viral_files``).
        threads: Thread count forwarded to ``make_mmseqs_db``.
        verbose: Verbosity flag forwarded to ``download_file`` and
            ``make_mmseqs_db``.

    Returns:
        Path of the created database,
        ``<output_dir>/refseq_viral.<iso date>.mmsdb``.
    """
    # download all of the viral protein files, need to know the number of files
    # TODO: Make it so that you don't need to know number of viral files in refseq viral

    if merged_viral_faas is None:  # download database if not provided
        import shutil  # local import: only needed for the merge step

        faa_base_name = 'viral.%s.protein.faa.gz'
        downloaded_faas = []
        for number in range(1, viral_files + 1):
            refseq_url = 'ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.%s.protein.faa.gz' % number
            refseq_faa = path.join(output_dir, faa_base_name % number)
            download_file(refseq_url, refseq_faa, verbose=verbose)
            downloaded_faas.append(refseq_faa)

        # Merge exactly the files downloaded above, in numeric order.
        # Globbing 'viral.*.protein.faa.gz' would also match the merged output
        # itself and any stale parts from an earlier run, and building a shell
        # 'cat' command breaks on paths containing spaces or metacharacters.
        # Byte-wise concatenation of gzip files yields a valid multi-member
        # gzip stream, which is what 'cat' produced as well.
        merged_viral_faas = path.join(output_dir,
                                      'viral.merged.protein.faa.gz')
        with open(merged_viral_faas, 'wb') as merged_handle:
            for refseq_faa in downloaded_faas:
                with open(refseq_faa, 'rb') as part_handle:
                    shutil.copyfileobj(part_handle, merged_handle)

    # make mmseqs database
    refseq_viral_mmseqs_db = path.join(
        output_dir, 'refseq_viral.%s.mmsdb' % get_iso_date())
    make_mmseqs_db(merged_viral_faas,
                   refseq_viral_mmseqs_db,
                   create_index=True,
                   threads=threads,
                   verbose=verbose)
    return refseq_viral_mmseqs_db
Esempio n. 3
0
def download_and_process_merops_peptidases(peptidase_faa=None,
                                           output_dir='.',
                                           threads=10,
                                           verbose=True):
    """Download MEROPS peptidases (if needed) and build an MMseqs2 database.

    Args:
        peptidase_faa: Path to an existing peptidase fasta. When ``None``,
            ``pepunit.lib`` is downloaded from the MEROPS current release and
            stored as ``<output_dir>/merops_peptidases_nr.faa``.
        output_dir: Directory that receives the download and the database.
        threads: Thread count forwarded to ``make_mmseqs_db``.
        verbose: Verbosity flag forwarded to ``download_file`` and
            ``make_mmseqs_db``.

    Returns:
        Path of the created database,
        ``<output_dir>/peptidases.<iso date>.mmsdb``.
    """
    must_download = peptidase_faa is None
    if must_download:
        # No fasta supplied: fetch the current MEROPS release.
        peptidase_faa = path.join(output_dir, 'merops_peptidases_nr.faa')
        download_file(
            'ftp://ftp.ebi.ac.uk/pub/databases/merops/current_release/pepunit.lib',
            peptidase_faa,
            verbose=verbose)

    db_file_name = 'peptidases.%s.mmsdb' % get_iso_date()
    peptidase_mmseqs_db = path.join(output_dir, db_file_name)
    make_mmseqs_db(peptidase_faa,
                   peptidase_mmseqs_db,
                   create_index=True,
                   threads=threads,
                   verbose=verbose)
    return peptidase_mmseqs_db
Esempio n. 4
0
def download_and_process_uniref(uniref_fasta_zipped=None,
                                output_dir='.',
                                uniref_version='90',
                                threads=10,
                                verbose=True):
    """Download a UniRef fasta (if needed) and build an MMseqs2 database.

    Args:
        uniref_fasta_zipped: Path to an existing UniRef fasta. When ``None``,
            ``uniref<version>.fasta.gz`` is downloaded from the UniProt FTP
            site into ``output_dir``.
        output_dir: Directory that receives the download and the database.
        uniref_version: UniRef version used in the download URL and in the
            output file names (default ``'90'``).
        threads: Thread count forwarded to ``make_mmseqs_db``.
        verbose: Verbosity flag forwarded to ``download_file`` and
            ``make_mmseqs_db``.

    Returns:
        Path of the created database,
        ``<output_dir>/uniref<version>.<iso date>.mmsdb``.
    """
    if uniref_fasta_zipped is None:
        # No fasta supplied: fetch the requested UniRef version.
        local_name = 'uniref%s.fasta.gz' % uniref_version
        uniref_fasta_zipped = path.join(output_dir, local_name)
        version_pair = (uniref_version, uniref_version)
        uniref_url = 'ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref%s/uniref%s.fasta.gz' % version_pair
        download_file(uniref_url, uniref_fasta_zipped, verbose=verbose)

    db_file_name = 'uniref%s.%s.mmsdb' % (uniref_version, get_iso_date())
    uniref_mmseqs_db = path.join(output_dir, db_file_name)
    make_mmseqs_db(uniref_fasta_zipped,
                   uniref_mmseqs_db,
                   create_index=True,
                   threads=threads,
                   verbose=verbose)
    return uniref_mmseqs_db
Esempio n. 5
0
def test_make_mmseqs_db(mmseqs_db_dir):
    """Check that make_mmseqs_db creates a file at the requested output path."""
    db_path = str(mmseqs_db_dir.join('mmseqs_db.mmsdb'))
    input_faa = os.path.join('tests', 'data', 'NC_001422.faa')
    # NOTE(review): the positional True / 1 are presumably create_index and
    # threads, matching the keyword usage elsewhere - confirm against
    # make_mmseqs_db's signature.
    make_mmseqs_db(input_faa, db_path, True, 1)
    assert os.path.isfile(db_path)
Esempio n. 6
0
def target_mmseqs_db(mmseqs_db_dir, phix_proteins):
    """Build ``target.mmsdb`` from ``phix_proteins`` and return its path.

    NOTE(review): presumably a pytest fixture whose decorator is not visible
    here - confirm.
    """
    target_db_path = str(mmseqs_db_dir.join('target.mmsdb'))
    make_mmseqs_db(phix_proteins, target_db_path, True, 1)
    return target_db_path
Esempio n. 7
0
def mmseqs_db(prodigal_faa, mmseqs_db_dir):
    """Build ``mmseqs_db.mmsdb`` from ``prodigal_faa`` and return its path.

    NOTE(review): presumably a pytest fixture whose decorator is not visible
    here - confirm.
    """
    query_db_path = str(mmseqs_db_dir.join('mmseqs_db.mmsdb'))
    make_mmseqs_db(prodigal_faa, query_db_path, True, 1)
    return query_db_path