Beispiel #1
0
def run_checkM(genome_folder, checkm_outf, **kwargs):
    '''
    Run checkM on a folder of .faa files and return the resulting table

    Args:
        genome_folder: location of folder to run checkM on (checkM is called
            with "-g -x faa", so it should hold files ending in .faa)
        checkm_outf: location of folder to store checkM output. File names are
            appended directly to this string (e.g. checkm_outf + 'Chdb.tsv'),
            so it is expected to end with a path separator

    Keyword args:
        processors: number of threads (default 6)
        checkM_method: either lineage_wf (default) or taxonomy_wf
        wd: if present, commands are logged to wd.get_dir('cmd_logs')

    Returns:
        DataFrame loaded from the Chdb.tsv table written by "checkm qa"

    The program exits (sys.exit) if checkm cannot be located, is reported as
    not working, or if the final table cannot be loaded.
    '''
    import drep.d_bonus as dBonus

    # Find checkm exe
    check_exe, works = dBonus.find_program('checkm')
    if check_exe is None:
        logging.error('Cannot locate the program {0}- make sure its in the system path'\
            .format('checkm'))
        sys.exit()
    if not works:
        logging.error('Program {0} is not working!! Im going to crash now'\
            .format('checkm'))
        sys.exit()

    # Get set up
    t = str(kwargs.get('processors', '6'))
    checkm_method = kwargs.get('checkM_method', 'lineage_wf')

    # The same log directory is used for both checkM calls
    if 'wd' in kwargs:
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Run checkM initial
    if checkm_method == 'taxonomy_wf':
        cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder,
               checkm_outf, '-f', checkm_outf + '/results.tsv', '--tab_table',
               '-t', t, '-g', '-x', 'faa']
    else:
        cmd = [check_exe, checkm_method, genome_folder, checkm_outf, '-f',
               checkm_outf + '/results.tsv', '--tab_table', '-t', t,
               '--pplacer_threads', t, '-g', '-x', 'faa']
    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Run checkM again for the better table
    if checkm_method == 'taxonomy_wf':
        lineage = checkm_outf + 'Bacteria.ms'
    else:
        lineage = checkm_outf + 'lineage.ms'
    desired_file = checkm_outf + 'Chdb.tsv'
    cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
           t, '--tab_table', '-o', '2']
    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Load table and return it
    try:
        chdb = pd.read_table(desired_file, sep='\t')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit are not
        # reported as a checkM failure
        logging.error("!!! checkM failed !!!\nIf using pyenv, make sure both python2 and " +\
            "python3 are available (for example: pyenv global 3.5.1 2.7.9)")
        sys.exit()
    return chdb
Beispiel #2
0
    def unit_tests_6(self):
        '''
        Run a single mash "dist" command through drep.run_cmd with shell=True
        and check that exactly three entries match the command-log folder
        prefix afterwards
        '''
        work_dir = WorkDirectory(self.working_wd_loc)
        mash_dir = work_dir.get_dir('MASH')
        log_dir = work_dir.get_dir('cmd_logs')

        # Compare the combined sketch against itself, redirecting the
        # distance table into the MASH folder
        sketch = mash_dir + 'ALL.msh'
        out_table = mash_dir + 'MASH_table.tsv'
        shell_cmd = ' '.join(['mash', 'dist', sketch, sketch, '>', out_table])
        drep.run_cmd(shell_cmd, shell=True, logdir=log_dir)

        n_logs = len(glob.glob(log_dir + '*'))
        assert n_logs == 3
Beispiel #3
0
def run_mash_on_genome_chunks(genome_chunks, mash_exe, sketch_folder,
                              MASH_folder, logdir, **kwargs):
    """
    Sketch, paste and compare every genome chunk with Mash

    Args:
        genome_chunks: list of chunk objects providing gen_sketch_cmds,
            gen_paste_cmd, gen_dist_cmd and load_mash_table
        mash_exe: mash executable handed to the chunks' command builders
        sketch_folder: passed on to merge_genome_chunks
        MASH_folder: passed on to merge_genome_chunks and gen_dist_cmd
        logdir: log directory handed to drep.run_cmd / drep.thread_cmds

    Keyword Args:
        dry: if True, build the commands but do not run them (default False)
        processors: number of threads (default 6)
        MASH_sketch: sketch size handed to gen_sketch_cmds (default 1000)
        multiround_primary_clustering: if False and there is more than one
            chunk, the chunks are merged into a single one (default True)

    Returns:
        list of chunks after load_mash_table() was called on each of them
    """
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    MASH_s = kwargs.get('MASH_sketch', 1000)
    multi_round = kwargs.get('multiround_primary_clustering', True)
    threads = int(p)

    # Step 1) Create Mash sketches
    sketch_cmds = [
        sketch_cmd
        for chunk in genome_chunks
        for sketch_cmd in chunk.gen_sketch_cmds(mash_exe, MASH_s)
    ]
    if not dry and sketch_cmds:
        drep.thread_cmds(sketch_cmds, logdir=logdir, t=threads)

    # Step 2) Combine MASH sketches within chunks
    paste_cmds = []
    for chunk in genome_chunks:
        paste_cmds.append(chunk.gen_paste_cmd(mash_exe))
    if not dry and paste_cmds:
        drep.thread_cmds(paste_cmds, logdir=logdir, t=threads)

    # Merge the pasted chunks and make a new genomeChunk if thats what you want
    several_chunks = len(genome_chunks) > 1
    if several_chunks and not multi_round:
        merge_cmd, merged_chunk = drep.d_cluster.utils.merge_genome_chunks(
            mash_exe, genome_chunks, sketch_folder, MASH_folder)
        genome_chunks = [merged_chunk]
        drep.run_cmd(merge_cmd, dry, shell=False, logdir=logdir)

    # Step 3) Run Mash on each chunk (commands are always generated, but only
    # executed when this is not a dry run)
    dist_cmds = [
        chunk.gen_dist_cmd(mash_exe, MASH_folder, p) for chunk in genome_chunks
    ]
    if not dry:
        total = len(dist_cmds)
        for position, dist_cmd in enumerate(dist_cmds, start=1):
            if total > 1:
                logging.info(f"  Comparing group {position} of {total}")
            drep.run_cmd(dist_cmd, dry, shell=True, logdir=logdir)

    # Step 4) Load the Mash tables of each chunk
    for chunk in genome_chunks:
        chunk.load_mash_table()

    return genome_chunks
Beispiel #4
0
def run_checkM(genome_folder, checkm_outf, **kwargs):
    '''
    Run checkM

    WARNING- this will result in wrong genome length and genome N50 estimate, due to
    it being run on prodigal output

    Args:
        genome_folder: location of folder to run checkM on - should be full of files ending in .faa (result of prodigal)
        checkm_outf: location of folder to store checkM output. File names are
            appended directly to this string (e.g. checkm_outf + 'Chdb.tsv'),
            so it is expected to end with a path separator

    Keyword args:
        processors: number of threads (default 6)
        checkM_method: either lineage_wf (default) or taxonomy_wf
        debug: if True (and wd is given), log all of the commands
        wd: if you want to log commands, you also need the wd
        set_recursion: if not 0, set the python recursion limit to this value
            (int or numeric string)

    Returns:
        DataFrame loaded from the Chdb.tsv table written by "checkm qa"

    The program exits (sys.exit) if checkm cannot be located, is reported as
    not working, or if the final table cannot be loaded.
    '''
    # Find checkm exe
    check_exe, works = drep.d_bonus.find_program('checkm')
    if check_exe is None:
        logging.error('Cannot locate the program {0}- make sure its in the system path'\
            .format('checkm'))
        sys.exit()
    if not works:
        logging.error('Program {0} is not working!! Im going to crash now'\
            .format('checkm'))
        sys.exit()

    # Get set up
    t = str(kwargs.get('processors', '6'))
    checkm_method = kwargs.get('checkM_method', 'lineage_wf')

    # Set recursion. Compare numerically so that both 0 and '0' mean
    # "leave the limit alone" (sys.setrecursionlimit(0) raises ValueError)
    R = kwargs.get('set_recursion', '0')
    if int(R) != 0:
        logging.warning('Setting Maximum Recursion depth to {0}'.format(R))
        sys.setrecursionlimit(int(R))

    # Commands are only logged when both a work directory and debug are given;
    # the same log directory is used for both checkM calls
    if 'wd' in kwargs and kwargs.get('debug', False) == True:
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Run checkM initial
    if checkm_method == 'taxonomy_wf':
        cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder,
               checkm_outf, '-f', checkm_outf + '/results.tsv', '--tab_table',
               '-t', t, '-g', '-x', 'faa']
    else:
        cmd = [check_exe, checkm_method, genome_folder, checkm_outf, '-f',
               checkm_outf + '/results.tsv', '--tab_table', '-t', t,
               '--pplacer_threads', t, '-g', '-x', 'faa']
    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Run checkM again for the better table
    if checkm_method == 'taxonomy_wf':
        lineage = checkm_outf + 'Bacteria.ms'
    else:
        lineage = checkm_outf + 'lineage.ms'
    desired_file = checkm_outf + 'Chdb.tsv'
    cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
           t, '--tab_table', '-o', '2']
    logging.debug("Running CheckM with command: {0}".format(cmd))
    drep.run_cmd(cmd, shell=False, logdir=logdir)

    # Load table
    try:
        chdb = pd.read_table(desired_file, sep='\t')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit are not
        # reported as a checkM failure
        logging.error(
            "!!! checkM failed !!!\nYou can run again with the --debug option to see what went wrong (command logs will be created in the log folder)"
        )
        sys.exit()

    # Return table
    return chdb
Beispiel #5
0
def run_checkM(genome_folder_whole, checkm_outf_whole, **kwargs):
    '''
    Run checkM in groups and return one combined table

    WARNING- this will result in wrong genome length and genome N50 estimate, due to
    it being run on prodigal output

    Args:
        genome_folder_whole: location of folder to run checkM on - should be full of files ending in .faa (result of prodigal)
        checkm_outf_whole: location of folder to store checkM output

    Both arguments are handed to _iterate_checkm_groups, which yields the
    (genome_folder, checkm_outf) pair of every group. File names are appended
    directly to each yielded checkm_outf (e.g. checkm_outf + 'Chdb.tsv'), so
    it is expected to end with a path separator.

    Keyword args:
        processors: number of threads (default 6)
        checkM_method: either lineage_wf (default) or taxonomy_wf
        checkm_group_size: group size handed to _iterate_checkm_groups (default 1000)
        debug: if True (and wd is given), log all of the commands
        wd: if you want to log commands, you also need the wd
        set_recursion: if not 0, set the python recursion limit to this value
            (int or numeric string)

    Returns:
        DataFrame with the Chdb.tsv tables of all groups concatenated and the
        index reset

    The program exits (sys.exit) if the table of any group cannot be loaded.
    '''
    # Get set up
    check_exe = _checkm_get_exe()
    t = str(kwargs.get('processors', '6'))
    checkm_method = kwargs.get('checkM_method', 'lineage_wf')
    checkm_group_size = kwargs.get('checkm_group_size', 1000)

    # Set recursion. Compare numerically so that both 0 and '0' mean
    # "leave the limit alone" (sys.setrecursionlimit(0) raises ValueError)
    R = kwargs.get('set_recursion', '0')
    if int(R) != 0:
        logging.warning('Setting Maximum Recursion depth to {0}'.format(R))
        sys.setrecursionlimit(int(R))

    # Establish groups
    dbs = []
    for genome_folder, checkm_outf in _iterate_checkm_groups(
            genome_folder_whole, checkm_outf_whole, checkm_group_size):

        # Commands are only logged when both a work directory and debug are
        # given; the same log directory is used for both checkM calls
        if 'wd' in kwargs and kwargs.get('debug', False) == True:
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False

        # Run checkM initial
        if checkm_method == 'taxonomy_wf':
            cmd = [check_exe, checkm_method, 'domain', 'Bacteria', genome_folder,
                   checkm_outf, '-f', checkm_outf + '/results.tsv', '--tab_table',
                   '-t', t, '-g', '-x', 'faa']
        else:
            cmd = [check_exe, checkm_method, genome_folder, checkm_outf, '-f',
                   checkm_outf + '/results.tsv', '--tab_table', '-t', t,
                   '--pplacer_threads', t, '-g', '-x', 'faa']
        logging.debug("Running CheckM with command: {0}".format(' '.join(cmd)))
        drep.run_cmd(cmd, shell=False, logdir=logdir)

        # Run checkM again for the better table
        if checkm_method == 'taxonomy_wf':
            lineage = checkm_outf + 'Bacteria.ms'
        else:
            lineage = checkm_outf + 'lineage.ms'
        desired_file = checkm_outf + 'Chdb.tsv'
        cmd = [check_exe, 'qa', lineage, checkm_outf, '-f', desired_file, '-t',
               t, '--tab_table', '-o', '2']
        logging.debug("Running CheckM with command: {0}".format(' '.join(cmd)))
        drep.run_cmd(cmd, shell=False, logdir=logdir)

        # Load table
        try:
            chdb = pd.read_table(desired_file, sep='\t')
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit are not
            # reported as a checkM failure
            logging.error("!!! checkM failed !!!\nSee https://drep.readthedocs.io/en/latest/advanced_use.html#troubleshooting-checkm for help troubleshooting")
            sys.exit()

        # Keep the table of this group
        dbs.append(chdb)

    # NOTE(review): pd.concat raises ValueError when no group was produced
    return pd.concat(dbs).reset_index(drop=True)
Beispiel #6
0
def all_vs_all_MASH(Bdb, data_folder, **kwargs):
    """
    Run MASH pairwise within all samples in Bdb

    Args:
        Bdb: dataframe with genome, location
        data_folder: location to store output files

    Keyword Args:
        MASH_sketch: size of mash sketches (default 1000)
        dry: dont actually run anything
        processors: number of processors to multithread with (default 6)
        exe_loc: location of mash executable; takes precedence over mash_exe
        mash_exe: location of mash executable (found with drep.get_exe if
            neither exe_loc nor mash_exe is provided)
        groupSize: max number of mash sketches to hold in each folder (default 1000)
        debug: if True, log all of the commands
        wd: if you want to log commands, you also need the wd

    Returns:
        Mdb: dataframe with genome1, genome2, dist and similarity (1 - dist),
        restricted to the genomes present in Bdb
    """

    MASH_s = kwargs.get('MASH_sketch', 1000)
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    groupSize = kwargs.get('groupSize', 1000)

    # set up logdir
    if 'wd' in kwargs and kwargs.get('debug', False) == True:
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash. exe_loc keeps its precedence; the documented mash_exe keyword
    # is honoured as well instead of being silently overwritten
    mash_exe = kwargs.get('exe_loc', None)
    if mash_exe is None:
        mash_exe = kwargs.get('mash_exe', None)
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Set up folders
    MASH_folder = os.path.join(data_folder, 'MASH_files/')
    os.makedirs(MASH_folder, exist_ok=True)

    sketch_folder = os.path.join(MASH_folder, 'sketches/')
    os.makedirs(sketch_folder, exist_ok=True)

    # Make chunks
    l2g = Bdb.set_index('location')['genome'].to_dict()
    locations = list(Bdb['location'].unique())
    chunks = [
        locations[x:x + groupSize] for x in range(0, len(locations), groupSize)
    ]

    # Make the MASH sketches (only for genomes without an existing sketch)
    cmds = []
    chunk_folders = []
    for i, chunk in enumerate(chunks):
        chunk_folder = os.path.join(sketch_folder, "chunk_{0}".format(i))
        chunk_folders.append(chunk_folder)
        os.makedirs(chunk_folder, exist_ok=True)
        for fasta in chunk:
            sketch_base = os.path.join(chunk_folder, l2g[fasta])
            if not os.path.isfile(sketch_base + '.msh'):
                cmds.append([
                    mash_exe, 'sketch', fasta, '-s',
                    str(MASH_s), '-o', sketch_base
                ])

    if not dry and cmds:
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches within chunk
    combined_name = 'chunk_all.msh'
    cmds = []
    alls = []
    for chunk_folder in chunk_folders:
        all_file = os.path.join(chunk_folder, combined_name)
        # A combined sketch left over from an earlier run lives in the same
        # folder; it must not be pasted into its own replacement
        sketches = [
            s for s in glob.glob(os.path.join(chunk_folder, '*'))
            if os.path.basename(s) != combined_name
        ]
        cmds.append([mash_exe, 'paste', all_file] + sketches)
        alls.append(all_file)
    if not dry and cmds:
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches of all chunks
    all_file = os.path.join(MASH_folder, 'ALL.msh')
    cmd = [mash_exe, 'paste', all_file] + alls
    drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Calculate distances (run through the shell because of the redirect)
    table_loc = MASH_folder + 'MASH_table.tsv'
    cmd = [
        mash_exe, 'dist', '-p',
        str(p), all_file, all_file, '>', table_loc
    ]
    cmd = ' '.join(cmd)
    drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Make Mdb based on all genomes in the MASH folder
    iniCols = ['genome1', 'genome2', 'dist', 'p', 'kmers']
    uCols = ['genome1', 'genome2', 'dist']
    dTypes = {'genome1': 'category', 'genome2': 'category', 'dist': np.float32}
    Mdb = pd.read_csv(table_loc,
                      names=iniCols,
                      usecols=uCols,
                      dtype=dTypes,
                      sep='\t')
    Mdb['genome1'] = Mdb['genome1'].apply(_get_genome_name_from_fasta)
    Mdb['genome2'] = Mdb['genome2'].apply(_get_genome_name_from_fasta)
    Mdb['similarity'] = 1 - Mdb['dist']

    # Filter out those genomes that are in the MASH folder but shouldn't be in Mdb
    genomes = Bdb['genome'].unique()
    Mdb = Mdb[Mdb['genome1'].isin(genomes)]
    Mdb = Mdb[Mdb['genome2'].isin(genomes)]

    # Reorder categories to be correct
    for g in ['genome1', 'genome2']:
        Mdb[g] = Mdb[g].cat.remove_unused_categories()
        Mdb[g] = Mdb[g].cat.reorder_categories(sorted((Mdb[g].unique())),
                                               ordered=True)

    return Mdb