# Module-level imports used by the functions below
import os
import glob
import logging

import numpy as np
import pandas as pd

import drep
import drep.d_cluster.utils
import drep.d_filter
from drep.d_cluster.utils import _get_genome_name_from_fasta


def prepare_mash(data_folder, **kwargs):
    """
    Make some folders and things
    """
    append = kwargs.get('v2', '')

    # Set up logdir
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash executable
    mash_exe = kwargs.get('exe_loc', None)
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Make a folder to hold this information
    MASH_folder = os.path.join(data_folder, 'MASH_files{0}/'.format(append))
    if not os.path.exists(MASH_folder):
        os.makedirs(MASH_folder)

    # Make a folder in there to store sketches
    sketch_folder = os.path.join(MASH_folder, 'sketches{0}/'.format(append))
    if not os.path.exists(sketch_folder):
        os.makedirs(sketch_folder)

    return logdir, MASH_folder, sketch_folder, mash_exe
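
# Usage sketch (illustrative, not part of dRep): a minimal call to
# prepare_mash. The data folder path is a placeholder and the _example_*
# function name is hypothetical; with no 'wd'/'debug' kwargs, logdir
# comes back as False.
def _example_prepare_mash():
    logdir, mash_folder, sketch_folder, mash_exe = prepare_mash(
        '/tmp/drep_example/data/',  # placeholder data_folder
        v2='_v2')                   # folders become MASH_files_v2/, sketches_v2/
    print(logdir, mash_folder, sketch_folder, mash_exe)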

def run_prodigal(genome_list, out_dir, **kwargs):
    '''
    Run prodigal on a set of genomes; store the output in out_dir

    Args:
        genome_list: list of genomes to run prodigal on
        out_dir: output directory to store prodigal output

    Keyword Args:
        processors: number of processors to multithread with
        exe_loc: location of prodigal executable (will try and find with shutil if not provided)
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd
    '''
    # Get set up
    t = kwargs.get('processors', 6)
    loc = kwargs.get('exe_loc', None)
    if loc is None:
        loc = drep.get_exe('prodigal')

    # Make sure it's a list
    assert isinstance(genome_list, list)

    # Make list of commands, skipping genomes whose output already exists
    cmds = []
    for genome in genome_list:
        fna = "{0}{1}".format(os.path.join(out_dir, os.path.basename(genome)), '.fna')
        faa = "{0}{1}".format(os.path.join(out_dir, os.path.basename(genome)), '.faa')
        if not (os.path.exists(fna) and os.path.exists(faa)):
            # Use the resolved executable path rather than a bare 'prodigal'
            cmds.append([loc, '-i', genome, '-d', fna, '-a', faa, '-m', '-p', 'meta'])

    # Run commands
    if len(cmds) > 0:
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, shell=False, logdir=logdir, t=int(t))
    else:
        logging.info("Past prodigal runs found- will not re-run")
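
# Usage sketch (illustrative; paths and the _example_* name are placeholders):
# run prodigal on two genomes with 4 threads. Genomes whose .fna/.faa pairs
# already exist in out_dir are skipped.
def _example_run_prodigal():
    genomes = ['/tmp/genomes/g1.fasta', '/tmp/genomes/g2.fasta']  # placeholders
    run_prodigal(genomes, '/tmp/prodigal_out/', processors=4)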

def run_pairwise_fastANI(genome_list, outdir, **kwargs):
    '''
    Run all-vs-all fastANI on a list of genomes; return a DataFrame of results
    (empty DataFrame on failure)
    '''
    p = kwargs.get('processors', 6)
    code = drep.d_cluster.utils._randomString(stringLength=10)

    # Make folders
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    tmp_dir = os.path.join(outdir, 'tmp/')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Make genome list
    glist = os.path.join(tmp_dir, 'genomeList')
    glist = _make_glist(genome_list, glist)

    # Gen command
    exe_loc = drep.get_exe('fastANI')
    out_base = os.path.join(outdir, 'fastANI_out_{0}'.format(code))
    cmd = [exe_loc, '--ql', glist, '--rl', glist, '-o', out_base,
           '--matrix', '-t', str(p), '--minFraction', str(0)]
    logging.debug(' '.join(cmd) + ' ' + code)

    # Run command
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False
    drep.thread_cmds([cmd], shell=False, logdir=logdir, t=1)

    # Load results
    fdb = load_fastani(out_base)

    # Fix missing self-comparisons; handle a broken self-comparison by skipping
    try:
        fdb = _fix_fastani(fdb)
        return fdb
    except Exception:
        logging.error(
            "CRITICAL ERROR WITH SECONDARY CLUSTERING CODE {0}; SKIPPING".format(code))
        return pd.DataFrame()
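
# Usage sketch (illustrative; paths and the _example_* name are placeholders):
# all-vs-all fastANI over a genome list. On failure the function logs the
# error and returns an empty DataFrame, so callers should check for that.
def _example_run_pairwise_fastANI():
    genomes = ['/tmp/genomes/g1.fasta', '/tmp/genomes/g2.fasta']  # placeholders
    fdb = run_pairwise_fastANI(genomes, '/tmp/fastANI_out/', processors=4)
    if len(fdb) == 0:
        logging.warning("fastANI comparison failed")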

def run_pairwise_goANI(bdb, goANI_folder, prod_folder, **kwargs):
    '''
    Run pairwise goANI on a list of genomes

    Args:
        bdb: DataFrame with ['genome', 'location']
        goANI_folder: folder to store goANI output
        prod_folder: folder containing prodigal output from genomes (will run if needed)

    Keyword arguments:
        debug: log all of the commands
        wd: if you want to log commands, you also need the wd
        processors: threads to use

    Returns:
        DataFrame: Ndb for goANI
    '''
    p = kwargs.get('processors', 6)
    nsimscan_exe = drep.get_exe('nsimscan')
    genomes = bdb['location'].tolist()

    # Make folders
    if not os.path.exists(goANI_folder):
        os.makedirs(goANI_folder)
    if not os.path.exists(prod_folder):
        os.makedirs(prod_folder)

    # Run prodigal
    logging.debug("Running prodigal...")
    drep.d_filter.run_prodigal(bdb['location'].tolist(), prod_folder, **kwargs)

    # Gen goANI commands
    logging.debug("Running goANI...")
    cmds = []
    files = []
    for i, g1 in enumerate(genomes):
        # Give each reference its own folder, to spread out .delta files
        cur_folder = os.path.join(
            goANI_folder, drep.d_cluster.utils._get_genome_name_from_fasta(g1))
        if not os.path.exists(cur_folder):
            os.makedirs(cur_folder)

        for j, g2 in enumerate(genomes):
            if i != j:
                name1 = drep.d_cluster.utils._get_genome_name_from_fasta(g1)
                name2 = drep.d_cluster.utils._get_genome_name_from_fasta(g2)
                file_name = "{0}/{1}_vs_{2}.sim".format(cur_folder, name1, name2)
                files.append(file_name)

                # If the file doesn't already exist, add it to what needs to be run
                if not os.path.isfile(file_name):
                    fna1 = "{0}.fna".format(os.path.join(prod_folder, name1))
                    fna2 = "{0}.fna".format(os.path.join(prod_folder, name2))
                    cmds.append(drep.d_cluster.utils.gen_goANI_cmd(
                        file_name, fna1, fna2, nsimscan_exe))

    # Run commands
    if len(cmds) > 0:
        logging.debug('Running goANI commands: {0}'.format(
            '\n'.join([' '.join(x) for x in cmds])))
        if ('wd' in kwargs) and kwargs.get('debug', False):
            logdir = kwargs.get('wd').get_dir('cmd_logs')
        else:
            logdir = False
        drep.thread_cmds(cmds, logdir=logdir, t=int(p))
    else:
        logging.debug("goANI already run- will not re-run")

    # Parse output
    df = drep.d_cluster.utils.process_goani_files(files)

    # Add self-comparisons if there is only one genome
    if len(genomes) == 1:
        Table = {'querry': [], 'reference': [], 'ani': [], 'alignment_coverage': []}
        for g in genomes:
            Table['reference'].append(drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['querry'].append(drep.d_cluster.utils._get_genome_name_from_fasta(g))
            Table['ani'].append(1)
            Table['alignment_coverage'].append(1)
        d = pd.DataFrame(Table)
        df = pd.concat([df, d], ignore_index=True)

    return df
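
# Usage sketch (illustrative; paths and the _example_* name are placeholders):
# bdb is the standard dRep table mapping genome names to file locations.
# Prodigal output lands in prod_folder and the .sim files in goANI_folder;
# note dRep spells the query column 'querry'.
def _example_run_pairwise_goANI():
    bdb = pd.DataFrame({
        'genome': ['g1.fasta', 'g2.fasta'],  # placeholder names
        'location': ['/tmp/genomes/g1.fasta', '/tmp/genomes/g2.fasta']})
    ndb = run_pairwise_goANI(bdb, '/tmp/goANI/', '/tmp/prodigal/', processors=4)
    print(ndb[['querry', 'reference', 'ani']].head())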

def all_vs_all_MASH(Bdb, data_folder, **kwargs):
    """
    Run MASH pairwise within all samples in Bdb

    Args:
        Bdb: dataframe with genome, location
        data_folder: location to store output files

    Keyword Args:
        MASH_sketch: size of mash sketches
        dry: don't actually run anything
        processors: number of processors to multithread with
        mash_exe: location of mash executable (will try and find with shutil if not provided)
        groupSize: max number of mash sketches to hold in each folder
        debug: if True, log all of the commands
        wd: if you want to log commands, you also need the wd
    """
    MASH_s = kwargs.get('MASH_sketch', 1000)
    dry = kwargs.get('dry', False)
    p = kwargs.get('processors', 6)
    groupSize = kwargs.get('groupSize', 1000)

    # Set up logdir
    if ('wd' in kwargs) and kwargs.get('debug', False):
        logdir = kwargs.get('wd').get_dir('cmd_logs')
    else:
        logdir = False

    # Find mash (accept either the 'mash_exe' or 'exe_loc' keyword)
    mash_exe = kwargs.get('mash_exe', kwargs.get('exe_loc', None))
    if mash_exe is None:
        mash_exe = drep.get_exe('mash')

    # Set up folders
    MASH_folder = os.path.join(data_folder, 'MASH_files/')
    if not os.path.exists(MASH_folder):
        os.makedirs(MASH_folder)

    sketch_folder = os.path.join(MASH_folder, 'sketches/')
    if not os.path.exists(sketch_folder):
        os.makedirs(sketch_folder)

    # Make chunks
    l2g = Bdb.set_index('location')['genome'].to_dict()
    locations = list(Bdb['location'].unique())
    chunks = [locations[x:x + groupSize]
              for x in range(0, len(locations), groupSize)]

    # Make the MASH sketches
    cmds = []
    chunk_folders = []
    for i, chunk in enumerate(chunks):
        chunk_folder = os.path.join(sketch_folder, "chunk_{0}".format(i))
        chunk_folders.append(chunk_folder)
        if not os.path.exists(chunk_folder):
            os.makedirs(chunk_folder)
        for fasta in chunk:
            genome = l2g[fasta]
            file = os.path.join(chunk_folder, genome)
            if not os.path.isfile(file + '.msh'):
                cmd = [mash_exe, 'sketch', fasta, '-s', str(MASH_s), '-o', file]
                cmds.append(cmd)
    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches within each chunk
    cmds = []
    alls = []
    for chunk_folder in chunk_folders:
        all_file = os.path.join(chunk_folder, 'chunk_all.msh')
        cmd = [mash_exe, 'paste', all_file] \
            + glob.glob(os.path.join(chunk_folder, '*'))
        cmds.append(cmd)
        alls.append(all_file)
    if not dry:
        if len(cmds) > 0:
            drep.thread_cmds(cmds, logdir=logdir, t=int(p))

    # Combine MASH sketches of all chunks
    all_file = os.path.join(MASH_folder, 'ALL.msh')
    cmd = [mash_exe, 'paste', all_file] + alls
    drep.run_cmd(cmd, dry, shell=False, logdir=logdir)

    # Calculate distances
    mash_table = os.path.join(MASH_folder, 'MASH_table.tsv')
    cmd = [mash_exe, 'dist', '-p', str(p), all_file, all_file, '>', mash_table]
    cmd = ' '.join(cmd)
    drep.run_cmd(cmd, dry, shell=True, logdir=logdir)

    # Make Mdb based on all genomes in the MASH folder
    iniCols = ['genome1', 'genome2', 'dist', 'p', 'kmers']
    uCols = ['genome1', 'genome2', 'dist']
    dTypes = {'genome1': 'category', 'genome2': 'category', 'dist': np.float32}
    Mdb = pd.read_csv(mash_table, names=iniCols, usecols=uCols, dtype=dTypes,
                      sep='\t')
    Mdb['genome1'] = Mdb['genome1'].apply(_get_genome_name_from_fasta)
    Mdb['genome2'] = Mdb['genome2'].apply(_get_genome_name_from_fasta)
    Mdb['similarity'] = 1 - Mdb['dist']

    # Filter out genomes that are in the MASH folder but shouldn't be in Mdb
    genomes = Bdb['genome'].unique()
    Mdb = Mdb[Mdb['genome1'].isin(genomes)]
    Mdb = Mdb[Mdb['genome2'].isin(genomes)]

    # Reorder categories to be correct
    for g in ['genome1', 'genome2']:
        Mdb[g] = Mdb[g].cat.remove_unused_categories()
        Mdb[g] = Mdb[g].cat.reorder_categories(sorted(Mdb[g].unique()),
                                               ordered=True)

    return Mdb
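
# Usage sketch (illustrative; paths and the _example_* name are placeholders):
# Bdb maps genome names to file locations, as elsewhere in dRep. The returned
# Mdb holds one row per genome pair with a 'similarity' column (1 - MASH dist).
def _example_all_vs_all_MASH():
    bdb = pd.DataFrame({
        'genome': ['g1.fasta', 'g2.fasta'],  # placeholder names
        'location': ['/tmp/genomes/g1.fasta', '/tmp/genomes/g2.fasta']})
    mdb = all_vs_all_MASH(bdb, '/tmp/drep_data/', processors=4, MASH_sketch=1000)
    print(mdb[['genome1', 'genome2', 'similarity']].head())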