def trimmo_module(files, path_name, sample_name, threads, Debug, trimmomatic_adapters):
    """Prepare and launch a trimmomatic run for one sample.

    The trimmomatic and java executables are retrieved via
    ``set_config.get_exe``. The adapters file must exist: if it does not,
    an error is printed and the program exits. Otherwise the work is
    delegated to ``trimmo_call``.

    :param files: Input files, forwarded to ``trimmo_call``.
    :param path_name: Forwarded to ``trimmo_call``.
    :param sample_name: Forwarded to ``trimmo_call``.
    :param threads: Forwarded to ``trimmo_call``.
    :param Debug: True/False for debugging messages.
    :param trimmomatic_adapters: Path to the adapters file (must exist).

    :returns: The value returned by ``trimmo_call`` (OK/FAIL according to
        the original notes of this function).
    """
    ## get exe
    trimmomatic_jar = set_config.get_exe('trimmomatic')
    java_path = set_config.get_exe('java')

    ## the adapters file is mandatory: report and abort when missing
    if not os.path.isfile(trimmomatic_adapters):
        print(colored("***ERROR: Trimmomatic adapters file does not exist: " + trimmomatic_adapters, 'red'))
        exit()

    ## debug message
    if Debug:
        print(colored("**DEBUG: trimmomatic_adapters file exists **", 'yellow'))
        print(trimmomatic_adapters)

    ## call
    return trimmo_call(java_path, path_name, sample_name, files,
                       trimmomatic_jar, threads, trimmomatic_adapters, Debug)
def snippy_core_call(list_folder, options, name, output_dir, output_format, Debug):
    """
    Create core alignment for samples align to the same reference

    ATTENTION: Requires sample names to be different within the first 10 characters.

    :param list_folder: Folders with per-sample snippy results; all of them
        are passed to snippy-core and the first one provides ``ref.fa``.
    :param options: Currently not used by this function.
    :param name: Prefix (inside ``output_dir``) for the files generated.
    :param output_dir: Folder where the log and the results are written.
    :param output_format: Alignment format, passed to ``-aformat``.
    :param Debug: True/false, forwarded to ``set_config.get_exe``.

    :type list_folder: list
    :type options: string
    :type name: string
    :type output_dir: string
    :type output_format: string
    :type Debug: bool

    :returns: Value returned by ``HCGB_sys.system_call``.
    """
    ## create snippy-core call
    snippy_core_exe = set_config.get_exe('snippy_core', Debug)

    ## start snippy_cmd
    list_folder_string = " ".join(list_folder)
    log_file = os.path.join(output_dir, "snippy_cmd.log")
    name_outdir = os.path.join(output_dir, name)

    ## use one reference: must be the same for all comparisons, so the first
    ## folder is as good as any other and it also works for a single folder
    reference_fasta = list_folder[0] + "/ref.fa"

    snippy_core_cmd = '%s -aformat %s --ref %s --prefix %s %s 2> %s' % (
        snippy_core_exe, output_format, reference_fasta, name_outdir,
        list_folder_string, log_file)

    return (HCGB_sys.system_call(snippy_core_cmd))
def main():
    """Entry point used when this file is called as a single script.

    Expected command line::

        script name fasta folder sample_name threads [file ...]

    The database ``name`` under ``folder`` is indexed from ``fasta`` when
    ``check_db_indexed`` reports it is missing; the remaining files are then
    searched against it with ``kma_ident_call``. The help is printed and
    the program exits when the five mandatory arguments are not all given.
    """
    ## control if options provided or help: five positional arguments are
    ## read below, so anything shorter cannot be processed
    if len(sys.argv) < 6:
        help_options()
        exit()
    print("")

    ## arguments
    name, fasta, folder, sample_name, threads = sys.argv[1:6]
    fasta = os.path.abspath(fasta)
    folder = os.path.abspath(folder)

    ## every remaining argument is an input file
    files = [os.path.abspath(f) for f in sys.argv[6:]]

    ## other
    kma_bin = set_config.get_exe("kma")
    out_file = sample_name + ".out_kma-search.txt"

    ## check if database is indexed
    if not check_db_indexed(name, folder):
        index_database(fasta, kma_bin, name, 'new', folder, '')
    print("\n+ Database indexed")

    ## search files
    kma_ident_call(out_file, files, sample_name, folder + '/' + name,
                   kma_bin, '', threads)
def ml_tree(folder, name, threads, output, Debug):
    """
    Create Maximum Likelihood tree reconstruction

    IQ-Tree is called on the alignment ``<folder>/<name>.aln``; results are
    written with prefix ``<output>/iqtree_<name>`` and stderr is redirected
    to ``<output>/iqtree.error.log``. A message is printed when the system
    call does not report 'OK'.

    :param folder: Snippy-core folder containing results.
    :param name: Name of the analysis.
    :param threads: Value passed to ``--threads-max``.
    :param output: Folder for the tree files and the error log.
    :param Debug: True/false for debugging messages

    :type folder: string
    :type name: string
    :type output: string
    :type Debug: bool

    :returns: Always an empty tuple.
    """
    iqtree_exe = set_config.get_exe('iqtree', Debug)
    bootstrap_number = '1000'

    ## input alignment and output locations
    alignment = os.path.join(folder, name + '.aln')
    error_log = os.path.join(output, 'iqtree.error.log')
    prefix = os.path.join(output, 'iqtree_' + name)

    iqtree_cmd = '%s -s %s -redo --threads-max %s --prefix %s -B %s 2> %s' % (
        iqtree_exe, alignment, threads, prefix, bootstrap_number, error_log)

    if HCGB_sys.system_call(iqtree_cmd) != 'OK':
        print("Some error occurred...")

    return ()
def run_module_SPADES_old(name, folder, file1, file2, threads):
    """Former assembly workflow: main assembly plus plasmid assembly.

    Steps: create ``folder``, run the main and the plasmid SPAdes
    assemblies, discard plasmid sequences from the main assembly, rename
    the fasta sequences, generate contig statistics and finally write a
    time stamp in ``<folder>/.success``.
    """
    print("+ Calling spades assembly for sample...", name)

    ## folder create
    HCGB_files.create_folder(folder)

    ## get configuration
    SPADES_bin = set_config.get_exe('spades')

    ## main and plasmid assemblies
    contigs_path = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads)
    plasmids_path = run_SPADES_plasmid_assembly(folder, file1, file2, name, SPADES_bin, threads)

    ## discard plasmids from main
    tmp_contigs, tmp_plasmids = discardPlasmids(contigs_path, plasmids_path, folder, name)

    ## rename fasta sequences: chromosome
    final_contigs = tmp_contigs.split(".fna.tmp")[0] + '.fna'
    rename_contigs(tmp_contigs, "scaffolds_chr", final_contigs)

    ## rename fasta sequences: plasmids (only when a plasmid file was produced)
    final_plasmids = ""
    if os.path.isfile(tmp_plasmids):
        final_plasmids = tmp_plasmids.split(".fna.tmp")[0] + '.fna'
        rename_contigs(tmp_plasmids, "scaffolds_plasmids", final_plasmids)

    ## contig stats
    stats(final_contigs, final_plasmids)

    ## success stamps
    HCGB_time.print_time_stamp(folder + '/.success')
def generate_seq_search_call(db, query, outfile, revcomp, start=0, end=-1, format='fasta'):
    """Build and run an ``efetch`` call retrieving a sequence (range).

    Options used:

    - ``-seq_start``: first sequence position to retrieve
    - ``-seq_stop``: last sequence position to retrieve
    - ``-revcomp``: shortcut for strand 2 (reverse complement), added when
      ``revcomp`` is true

    The output is redirected to ``outfile``.

    :returns: Value returned by ``HCGB_sys.system_call``.
    """
    efetch_bin = set_config.get_exe("efetch")

    ## command pieces are concatenated as they are
    pieces = [
        "%s -db %s -id %s -seq_start %s -seq_stop %s -format %s" % (
            efetch_bin, db, query, start, end, format)
    ]

    ## add reverse complement
    if revcomp:
        pieces.append(' -revcomp')

    ## add output file
    pieces.append(' > %s' % outfile)

    return HCGB_sys.system_call(''.join(pieces))
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """
    Check for a previous annotation and, if none, annotate with Prokka_.

    - If ``path`` is a folder containing a ``.success`` stamp, the stamp is
      read, a message is printed and nothing else is done.
    - Otherwise Prokka_ is called via ``prokka_call`` and, once finished,
      a new time stamp is written in ``<path>/.success``.

    :param sequence_fasta: Assembled sequences in fasta file format.
    :param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
    :param genus: Available genus options for Prokka software.
    :param path: Absolute path to the output folder to include results.
    :param name: Sample name and tag to include in the annotation report and files.
    :param threads: Number of CPUs to use.

    :type sequence_fasta: string
    :type kingdom: string
    :type genus: string
    :type path: string
    :type name: string
    :type threads: integer

    :returns: Empty tuple when results already existed; otherwise the value
        returned by ``prokka_call``.

    .. seealso:: This function depends on other functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

        - :func:`BacterialTyper.scripts.annotation.prokka_call`

        - :func:`HCGB.functions.time_functions.read_time_stamp`

        - :func:`HCGB.functions.time_functions.print_time_stamp`

    .. include:: ../../links.inc
    """
    filename_stamp = path + '/.success'

    ## check if previously done and succeeded
    previously_done = os.path.isdir(path) and os.path.isfile(filename_stamp)
    if previously_done:
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, name), 'yellow'))
        return ()

    ## call prokka
    prokka_bin = set_config.get_exe('prokka')
    dirname = prokka_call(prokka_bin, sequence_fasta, kingdom, genus, path, name, threads)

    ## success stamps
    HCGB_time.print_time_stamp(filename_stamp)

    return dirname
def BUSCO_plot(outfolder):
    """Call the BUSCO ``generate_plot`` executable on ``outfolder``.

    The working directory of the process is changed to ``outfolder`` and is
    not restored afterwards.

    :returns: Always an empty tuple.
    """
    plot_exe = set_config.get_exe('generate_plot')

    ## move to the folder and plot
    os.chdir(outfolder)
    HCGB_sys.system_call('{} -wd {}'.format(plot_exe, outfolder))

    return ()
def run_module_assembly(name, folder, file1, file2, threads, debug=False):
    """Assembly main module call.

    It calls the assembly function to process the data provided and returns
    genome statistics.

    Steps:

    - Retrieves SPADES_ executable
      (See :func:`BacterialTyper.scripts.set_config.get_exe`).

    - Generates a call to SPADES_ assembler
      (See :func:`BacterialTyper.scripts.spades_assembler.run_SPADES_assembly`).

    - Unless the assembly reports **FAIL**, contig statistics are generated
      (:func:`BacterialTyper.scripts.spades_assembler.contig_stats`).

    :param name: Sample name or tag to identify sample
    :param folder: Absolute path to folder.
    :param file1: Absolute path to fastq reads (R1).
    :param file2: Absolute path to fastq reads (R2).
    :param threads: Number of CPUs to use.
    :param debug: True/False, forwarded to the assembly and statistics calls.

    :type name: string
    :type folder: string
    :type file1: string
    :type file2: string
    :type threads: integer

    :return: List with the statistics and the statistics file, as returned
        by ``contig_stats``: ``[stats_dict, excel_file]``.

    :warnings: Returns **FAIL** if assembly process stopped.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

        - :func:`BacterialTyper.scripts.spades_assembler.run_SPADES_assembly`

        - :func:`BacterialTyper.scripts.spades_assembler.contig_stats`

    .. include:: ../../links.inc
    """
    print("+ Calling spades assembly for sample...", name)

    ## get configuration
    SPADES_bin = set_config.get_exe('spades')

    ## assembly main
    contigs = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads, debug)
    if contigs == 'FAIL':
        return ('FAIL')

    ## contig stats
    stats_dict, excel_file = contig_stats(contigs, debug)

    ## check statistics in file
    print("+ Check statistics for sample %s in file:\n%s" % (name, excel_file))
    return [stats_dict, excel_file]
def blastn(outFile, DBname, fasta, threads):
    """Run blastn of ``fasta`` against database ``DBname`` into ``outFile``.

    Tabular output (format 6 plus query/subject lengths) with an e-value
    cutoff of 1e-20. When the system call reports 'FAIL' the command is
    printed and the program exits.
    """
    blastn_exe = set_config.get_exe('blastn')
    cmd_blastn = "%s -db %s -query %s -out %s -evalue 1e-20 -outfmt \'6 std qlen slen\' -num_threads %s" % (
        blastn_exe, DBname, fasta, outFile, threads)

    ## nothing else to do unless the call failed
    if system_call(cmd_blastn) != 'FAIL':
        return

    print(colored('****ERROR: Some error happened during the blastn command', 'red'))
    print(cmd_blastn)
    exit()
def makeblastdb(DBname, fasta):
    """Generate a nucleotide BLAST database ``DBname`` from ``fasta``.

    Nothing is generated when ``<DBname>.nhr`` already exists. When the
    system call reports 'FAIL' the command is printed and the program exits.
    """
    makeblastdb_exe = set_config.get_exe('makeblastdb')

    ## already available
    if os.path.isfile(DBname + '.nhr'):
        print("+ BLAST database is already generated...")
        return

    cmd_makeblast = "%s -in %s -input_type fasta -dbtype %s -out %s" % (
        makeblastdb_exe, fasta, 'nucl', DBname)

    if system_call(cmd_makeblast) == 'FAIL':
        print(colored('****ERROR: Some error happened during the makeblastDB command', 'red'))
        print(cmd_makeblast)
        exit()
def get_external_kma(kma_external_files, Debug):
    """Index the external sequence files provided as KMA databases.

    Each file is indexed (if necessary) in the same folder where it lives,
    using ``species_identification_KMA.generate_db``. Files that could not
    be indexed are reported and left out.

    :param kma_external_files: Paths to the sequence files. If empty or
        None an error is printed and the program exits.
    :param Debug: True/False, forwarded to ``generate_db``.

    :returns: String ``"kma_external:"`` followed by the comma-separated
        databases generated.
    """
    print('\t- Get additional kma databases:')

    if not kma_external_files:
        ## rise error & exit
        print(colored("***ERROR: No database provided via --kma_external_file option.\n", 'red'))
        exit()

    ## external sequences provided are indexed and generated in the same
    ## folder provided: absolute paths, duplicates removed, sorted so the
    ## resulting string does not change between runs
    kma_external_files = sorted({os.path.abspath(f) for f in kma_external_files})

    ## check if indexed and/or index if necessary
    external_kma_dbs_list = []

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    for f in kma_external_files:
        file_name = os.path.basename(f)
        fold_name = os.path.dirname(f)
        print(colored('\t\t+ %s' % file_name, 'green'))
        print()

        ## generate db
        databaseKMA = species_identification_KMA.generate_db(
            [f], file_name, fold_name, 'new', 'single', Debug, kma_bin)
        if not databaseKMA:
            ## report which file failed; 'red' as in the other error messages
            print(colored("***ERROR: Database provided is not indexed: %s\n" % f, 'red'))
        else:
            external_kma_dbs_list.append(databaseKMA)

    external_kma_dbs_string = ','.join(external_kma_dbs_list)
    option_db = "kma_external:" + external_kma_dbs_string

    return (option_db)
def print_list_prokka():
    """
    Prints the Prokka_ databases installed and available to use.

    It is the output from the call:

    .. code-block:: sh

        prokka --listdb

    .. include:: ../../devel/results/print_list_prokka.txt
        :literal:

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

    .. include:: ../../links.inc
    """
    list_db_cmd = " ".join([set_config.get_exe('prokka'), "--listdb"])
    HCGB_sys.system_call(list_db_cmd)
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):
    """Run BUSCO for one sample and dataset unless already done.

    A previous successful run is detected through the ``.success`` stamp in
    ``<output_name>/<dataset_name>/run_<dataset_name>``. Otherwise BUSCO is
    called from within ``output_name`` (the working directory is changed
    and not restored) and the run is considered successful when
    ``short_summary.txt`` exists in that folder.

    :returns: ``'FAIL'`` when the summary file is missing after the run,
        an empty tuple otherwise.
    """
    run_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)

    ## timestamp
    filename_stamp = run_folder + '/.success'

    print(colored("\tBUSCO Dataset [%s]; Sample [%s]" % (dataset_name, sample_name), 'yellow'))

    ## check previous run
    if os.path.isfile(filename_stamp):
        timestamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tSuccessfully run on date: %s" % timestamp, 'green'))
        return ()

    busco_bin = set_config.get_exe('busco')
    os.chdir(output_name)

    ## init cmd configuration
    base_cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' % (
        busco_bin, fasta, threads, mode, busco_db)

    ## options if autolineage or given dataset
    if dataset_name == "auto-lineage":
        lineage_opts = '--auto-lineage -o %s > %s' % (dataset_name, 'auto_lineage.log')
    else:
        lineage_opts = '-l %s -o %s > %s' % (dataset_name, dataset_name, dataset_name + '.log')

    ## system call
    HCGB_sys.system_call(base_cmd + lineage_opts)

    ## the summary file tells whether BUSCO finished
    if not os.path.isfile(run_folder + '/short_summary.txt'):
        print(colored("BUSCO failed: Dataset [%s]; Sample [%s]" % (dataset_name, fasta), 'red'))
        return ('FAIL')

    ## timestamp
    HCGB_time.print_time_stamp(filename_stamp)
    return ()
def print_available_BUSCO():
    """Print the datasets listed by ``busco --list-datasets``.

    The listing is redirected to a file named ``tmp`` in the current
    folder, printed and removed. The ``busco_downloads`` folder left in
    the current folder by the call is removed as well.
    """
    HCGB_aes.print_sepLine("-", 100, False)

    busco_bin = set_config.get_exe('busco')

    ## get datasets
    busco_bin_call = busco_bin + ' --list-datasets > tmp'
    HCGB_sys.system_call(busco_bin_call, message=False)

    ## dump in screen
    with open("./tmp", 'r') as f:
        print(f.read())

    ## clean: files downloaded by busco plus the temporary listing itself
    ## (the listing used to be left behind because the concatenation result
    ## was discarded)
    list_files = list(HCGB_main.get_fullpath_list("./busco_downloads", False))
    list_files.append('tmp')
    for i in list_files:
        os.remove(i)

    os.rmdir("./busco_downloads/information")
    os.rmdir("./busco_downloads/")

    HCGB_aes.print_sepLine("-", 100, False)
    print("\n")
def print_dependencies():
    """Print a table with executable path and version of each dependency.

    Dependencies are the entries obtained when iterating over the value
    returned by ``read_dependencies``. For each of them the executable is
    retrieved with ``set_config.get_exe`` and the version with
    ``get_version``.
    """
    table = {}
    for program in read_dependencies():
        exe_path = set_config.get_exe(program)
        table[program] = [exe_path, get_version(program, exe_path)]

    df_programs = pd.DataFrame.from_dict(
        table, orient='index', columns=('Executable path', 'Version'))

    ## remove leading whitespaces in every cell
    stacked = df_programs.stack()
    df_programs = stacked.str.lstrip().unstack()

    ## do not truncate when printing
    for display_option in ('display.max_colwidth', 'display.max_columns'):
        pd.set_option(display_option, None)

    print(df_programs)
def install_R_packages(package, source, install_path, extra):
    """Install an R package into ``install_path`` using Rscript.

    The install script is chosen according to ``source``: for ``'github'``
    the github install script is used and the package is addressed as
    ``<extra>/<package>``. After the call, the presence of an ``MLSTar``
    folder within ``install_path`` is checked: if available,
    ``install_path`` is recorded in ``R/R_package.info.txt`` next to this
    file; otherwise an error message is printed.
    """
    install_R, install_github_package = get_install_R_files()

    HCGB_files.create_folder(install_path)
    Rscript_exe = set_config.get_exe('Rscript')
    print("+ Installing %s package..." % package)

    ## choose install script (and package name) according to source
    if source == 'github':
        install_file = install_github_package
        package = extra + '/' + package
    else:
        install_file = install_R

    HCGB_sys.system_call(
        '%s %s -l %s -p %s' % (Rscript_exe, install_file, package, install_path))

    ## check if exists or try to install
    if not os.path.exists(os.path.join(install_path, 'MLSTar')):
        print_error_message(package, "No R package found", 'package')
        print('Please install manually to proceed...')
        return

    this_dir = os.path.dirname(os.path.realpath(__file__))
    RDir_package = os.path.join(this_dir, 'R', 'R_package.info.txt')
    HCGB_main.printList2file(RDir_package, [install_path])
def install_git_repo(git_repo, folder_sofware, install_path, option, Debug):
    """Clone (or update) a git repository and optionally compile it.

    All work is done from ``install_path``. If ``folder_sofware`` already
    exists, ``git pull`` is run inside it; otherwise ``git_repo`` is
    cloned. When ``option`` is ``'make'``, ``make`` is called inside
    ``folder_sofware``. The initial working directory is always restored.

    :param git_repo: Repository to clone.
    :param folder_sofware: Folder of the software; relative paths are
        interpreted from ``install_path``.
    :param install_path: Folder where the repository is cloned.
    :param option: ``'make'`` to compile after getting the code.
    :param Debug: True/False, forwarded to ``set_config.get_exe``.

    :returns: Always True.
    """
    ## current path
    current_path = os.getcwd()
    os.chdir(install_path)

    try:
        ## resolve once, while located in install_path, so the folder can be
        ## entered later no matter what the working directory is by then
        software_dir = os.path.abspath(folder_sofware)

        ## git clone repo
        print('+ Using git to get code...')
        git_exe = set_config.get_exe('git', Debug)

        if os.path.exists(software_dir):
            print('+ Update repository (git pull)...')
            ## pull
            os.chdir(software_dir)
            cmd = git_exe + ' pull'
        else:
            print('+ Clone repository...')
            ## clone
            cmd = git_exe + ' clone ' + git_repo

        ## call git
        HCGB_sys.system_call(cmd)

        ## compile if necessary
        if option == 'make':
            print('+ Compile software...')
            os.chdir(software_dir)
            HCGB_sys.system_call('make')
    finally:
        ## chdir to previous path, also if any step above raised
        os.chdir(current_path)

    return (True)
def get_MLSTar_package_installed(debug=False):
    """Locate the MLSTar R package, installing it as a last resort.

    Tried in order:

    1. package available in the system R library;
    2. package available in the installation path given by
       ``set_config.R_package_path_installed``;
    3. installation from github (``iferres/MLSTar``) into that path.

    :param debug: True/False for debugging messages.

    :returns: ``'system'`` for case 1, the installation path for cases 2
        and 3. The program prints ``ERROR`` and exits if the installation
        does not report 'OK'.
    """
    install_path = set_config.R_package_path_installed()
    (check_install_system, check_install_path) = set_config.get_check_R_files()
    R_script_exe = set_config.get_exe('Rscript')

    if debug:
        print('\n+ Check package: MLSTar')

    ## ATTENTION: optparse library missing, no installation within conda

    ## first try to check if package available in system
    cmd_check = R_script_exe + ' ' + check_install_system + ' -l MLSTar'
    if functions.system_call(cmd_check, message=False, returned=False) == 'OK':
        return ('system')

    ## check if installed in path
    cmd_check_path = R_script_exe + ' ' + check_install_path + ' -l MLSTar -p ' + install_path
    if functions.system_call(cmd_check_path, message=False, returned=False) == 'OK':
        return (install_path)

    ## not available: try to install it
    (install_R, install_github_package) = install_dependencies.get_install_R_files()
    cmd_R = '%s %s -l iferres/MLSTar -p %s' % (
        R_script_exe, install_github_package, install_path)
    code3 = functions.system_call(cmd_R, message=False, returned=False)

    ## compare explicitly as for the checks above: the failure code is a
    ## non-empty string and would always be taken as success otherwise
    if code3 == 'OK':
        return (install_path)

    print('ERROR')
    exit()
def snippy_call(reference_fasta, list_files, threads, outdir, name, contig_option, other_options, Debug):
    """
    Creates variant calling for a sample vs. a reference.

    By default, it uses ``rgid`` option with argument ``name`` provided.

    Argument ``list_files`` contains files to map that could be a single
    end file, two paired-end fastq files or a contig file. If fasta contig
    files provided, set ``contig_option`` True (only the first file is
    used in that case).

    The snippy stderr is redirected to ``<outdir>/snippy_cmd.log``.

    :param reference_fasta: Absolute path to reference fasta file.
    :param list_files: List of absolute path to fastq files (.fq / .fq.gz / fastq / fastq.gz)
    :param threads: Number of CPU cores to use.
    :param outdir: Output folder.
    :param name: Name of the sample
    :param contig_option: True/false to map contigs provided instead of files. Contigs provided via list_files.
    :param other_options: String of options for the snippy call (currently not added to the command).
    :param Debug: True/false for debugging messages

    :type reference_fasta: string
    :type list_files: list
    :type threads: int
    :type outdir: string
    :type name: string
    :type contig_options: bool
    :type other_options: string
    :type Debug: bool

    :returns: False if no suitable input files are provided, otherwise the
        value returned by ``HCGB_sys.system_call``.
    """
    ## create snippy call
    snippy_exe = set_config.get_exe('snippy', Debug)

    ## start snippy_cmd
    ## force option: prevent finish early if folder exists
    ## unmapped option: keep unmapped reads
    log_file = os.path.join(outdir, "snippy_cmd.log")
    snippy_cmd = '%s --cpus %s --reference %s --force --unmapped --outdir %s --rgid %s' % (
        snippy_exe, threads, reference_fasta, outdir, name)

    ## add files to map
    if not list_files:
        ## nothing to map: same report for contigs and reads
        input_option = None
    elif contig_option:
        input_option = ' --ctgs ' + list_files[0]
    elif len(list_files) == 1:
        input_option = ' --se ' + list_files[0]
    elif len(list_files) == 2:
        input_option = ' --pe1 ' + list_files[0] + ' --pe2 ' + list_files[1]
    else:
        input_option = None

    if input_option is None:
        print(colored("** ERROR: No reads or contigs provided...", "red"))
        return (False)

    ## add input and log
    snippy_cmd = snippy_cmd + input_option + ' 2> ' + log_file

    ## debug message
    if Debug:
        print(colored("**DEBUG: snippy_cmd **", 'yellow'))
        print(snippy_cmd)

    ## create system call
    return (HCGB_sys.system_call(snippy_cmd, returned=False, message=True))
def GI_module(genbank_file, name, outdir, Debug, cutoff_dinuc_bias=8, min_length=1000):
    """Identify genomic islands (GI) within the genbank file provided.

    They are calculated based on gene annotation and dinucleotide bias
    region using the software `IslandPath-DIMOB`_.

    A previous successful run is detected through the stamp file
    ``<outdir>/.Dimob``; in that case nothing is executed.

    :param genbank_file: Absolute path to annotation file in Genbank format.
    :param name: Sample identifier.
    :param outdir: Absolute path to output folder.
    :param Debug: True/false for debugging messages.
    :param cutoff_dinuc_bias: Dinucleotide bias cutoff
    :param min_length: Minimun length for the regions to be reported

    :type name: string
    :type genbank_file: string
    :type outdir: string
    :type Debug: bool
    :type cutoff_dinuc_bias: int
    :type min_length: int

    :returns: ``outdir`` if results are available (new or previous),
        False if the Dimob call failed.

    The Dimob.pl perl script has two mandatory argument which are the
    input :file:`genbank_file` and an output name.

    .. code-block:: sh

        Usage:
        perl Dimob.pl <genome.gbk> <output_name> [cutoff_dinuc_bias] [min_length]

        Default values:
            cutoff_dinuc_bias = 8
            min_length = 8000

        Example:
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs
            perl Dimob.pl example/NC_003210.gbk NC_003210_GIs 6 10000
            perl Dimob.pl example/NC_000913.embl NC_000913_GIs 6 10000

    During the development of BacterialTyper, we generated a modification
    of the original `IslandPath-DIMOB`_ to analyze contig sequence data and
    generate a different output format for better clarification and
    interpretation of results. We forked the original code into a new git
    repository and updated the code accordingly. See details here:
    https://github.com/JFsanchezherrero/islandpath.

    .. include:: ../../links.inc
    """
    ## filename stamp of the process
    filename_stamp = outdir + '/.Dimob'

    # check if previously done
    if os.path.isfile(filename_stamp):
        stamp = functions.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s -- Dimob]" % (stamp, name), 'yellow'))
        return (outdir)

    ## debug message
    if Debug:
        print(colored("**DEBUG: Call Dimob for sample %s " % name + "**", 'yellow'))
        print("genbank_file", genbank_file)
        print("outdir: ", outdir)

    ## Call IslandPath Dimob executable perl file.
    dimob_pl = set_config.get_exe("dimob", Debug)
    perl_exe = set_config.get_exe("perl", Debug)

    ## command
    outdir_sample = os.path.join(outdir, name)
    log_file = outdir_sample + '.log'
    perl_cmd = '%s %s %s %s %s %s > %s' % (
        perl_exe, dimob_pl, genbank_file, outdir_sample,
        cutoff_dinuc_bias, min_length, log_file)

    code = functions.system_call(perl_cmd)

    ## the failure code is the non-empty string 'FAIL': check it explicitly
    ## so no success stamp is written after a failed run
    if not code or code == 'FAIL':
        return False

    ## when finished print time stamp in output + '/.Dimob'
    functions.print_time_stamp(filename_stamp)

    return (outdir)
def generate_xtract_call(docsum_file, pattern, element, outfile):
    """Retrieve the ``xtract`` executable and delegate to ``xtract_call``.

    :returns: Value returned by ``xtract_call``.
    """
    return xtract_call(docsum_file, pattern, element, outfile,
                       set_config.get_exe("xtract"))
def generate_docsum_call(db, query, outfile):
    """Retrieve ``esearch`` and ``efetch`` executables and delegate to ``docsum_call``.

    :returns: Value returned by ``docsum_call``.
    """
    ## arguments are evaluated left to right: esearch first, then efetch
    return docsum_call(db, query, outfile,
                       set_config.get_exe("esearch"),
                       set_config.get_exe("efetch"))
def agrvate_call(sample, assembly_file, folder, debug=False):
    """Call agrvate and check results.

    agrvate is executed on ``assembly_file`` with mummer (``-m``) and
    forcing the results folder (``-f``); stdout and stderr are saved in
    ``agrvate_cmd.log`` and ``agrvate_cmd.err`` within ``folder``.

    If the results folder ``<assembly name>-results`` is generated in
    ``folder``, it is renamed to ``agrvate_results``, the error report is
    moved into it and the tables are collected in a single data frame and
    saved in ``<sample>_agr_results.xlsx`` (sheets: summary, operon, steps).

    See https://github.com/VishnuRaghuram94/AgrVATE#results for additional
    details on the files generated.

    :param sample: Sample name, added to the tables and to the excel name.
    :param assembly_file: Assembly fasta file (name expected to contain ``.fna``).
    :param folder: Folder where agrvate results are expected and stored.
    :param debug: True/False for debugging messages.

    :returns: pandas data frame with summary and error report information
        plus columns ``operon_fna`` and ``agr_operon_xlsx``. An empty data
        frame is returned if no results folder was generated.
    """
    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call: use mummer (-m) and force results folder (-f)
    cmd_call = "%s -i %s -m -f > %s 2> %s " % (agrvate_bin, assembly_file, log_call, err_call)
    HCGB_sys.system_call(cmd_call)

    ## check results
    results = pd.DataFrame()

    ## check folder is created: success is judged by the results folder
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if not os.path.isdir(original_results_folder):
        return (results)

    print("+ Results folder generated OK")
    print("+ Check results generated:")

    ## rename folder
    os.rename(original_results_folder, results_folder)
    os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
              os.path.join(results_folder, 'error_report.tab'))

    ## write to excel: the context manager saves and closes the file, also
    ## if any of the steps below fails
    file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
    with pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') as writer_Excel:

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample

        ## columns
        # agr_group: gp1/gp2/gp3/gp4. 'u' means unknown.
        ##      If multiple agr groups were found (col 5 = m),
        ##      the displayed agr group is the majority/highest confidence.
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown
        ##      Multiple groups are found likely due to multiple S. aureus isolates in sequence
        # frameshifts: Number found in CDS of extracted agr operon ('u' if agr operon not extracted)

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel: tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary')

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample

            ## columns
            ## Assembly Contig ID
            ## ID of matched agr group kmer
            ## evalue
            ## Percentage identity of match
            ## Start position of kmer alignment on input sequence
            ## End position of kmer alignment on input sequence

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)

            ## save agr_gp_tab file into excel: tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon')

        ## agr_operon fna: optional file, empty string when not available
        operon_files = [s for s in list_files if s.endswith("agr_operon.fna")]
        agr_operon_fna_file = operon_files[0] if operon_files else ''

        ## debug messages
        if debug and agr_operon_fna_file:
            HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
            print(agr_operon_fna_file)

        results['operon_fna'] = agr_operon_fna_file

        ## error report
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report = HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)

        ## save error_report file into excel: tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps')

        ## merge results
        results = pd.concat([results, error_report], axis=1)

    ## add to pandas dataframe
    results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
def _config_section_header(title, width=20):
    """Print ``title`` framed by two '+' separator lines of ``width`` characters."""
    HCGB_aes.print_sepLine("+", width, False)
    print(title)
    HCGB_aes.print_sepLine("+", width, False)


def _config_require_min_version(min_version, found_version):
    """Report whether ``found_version`` satisfies ``min_version``; exit if it does not."""
    if LooseVersion(found_version) >= LooseVersion(min_version):
        print(colored("Minimum version (%s) satisfied: %s" % (min_version, found_version), 'green'))
    else:
        print(colored("Minimum version (%s) not satisfied: %s" % (min_version, found_version), 'red'))
        exit()


def _config_exit_not_a_folder(path):
    """Report that ``path`` is not a folder and exit."""
    print(colored("\n*** ERROR ****", 'red'))
    print(colored("Path provided is not a folder", 'red'))
    print(path)
    exit()


def run(options):
    """
    This is the main function of the module ``config``. It basically checks
    if the different requirements (python and third-party software) are
    fulfilled.

    If any requirement is not available this module tries to install it or
    reports to the user to manually install it.

    The following attributes of ``options`` are read:

    - ``option`` (string): State whether to check or install missing
      modules, packages and third party software. Provide:
      only_check/install
    - ``install_path`` (string): Absolute path to install modules or
      packages missing. If not provided, ``../software`` relative to the
      folder of the running executable is used (and stored back in
      ``options.install_path``).
    - ``IslandPath`` (boolean): True/False for checking additional perl and
      software required by this option analysis.
    - ``debug`` (boolean): True/false for debugging messages. It sets the
      module-level variable ``Debug``.

    The program exits if the installation path is not valid or the minimum
    python or perl version is not satisfied.

    .. seealso:: This function depends on several ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.check_python_packages`

        - :func:`BacterialTyper.config.set_config.check_perl_packages`

        - :func:`BacterialTyper.config.extern_progs.return_min_version_soft`

        - :func:`BacterialTyper.config.extern_progs.print_dependencies`
    """
    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Pipeline Configuration")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    if options.install_path:
        if not os.path.isdir(options.install_path):
            _config_exit_not_a_folder(options.install_path)
        if Debug:
            print("Installation path provided for missing modules, packages, dependencies...")
            print("Path: " + options.install_path)
    else:
        ## get python environment path: '_' is not guaranteed to be
        ## available in the environment, so the location of the python
        ## interpreter is used when it is missing
        env_bin_directory = os.path.dirname(os.environ.get('_', sys.executable))
        options.install_path = os.path.abspath(
            os.path.join(env_bin_directory, '../software'))

        if Debug:
            print("Retrieve environment path as installation path:")
            print("Path: " + options.install_path)

        HCGB_files.create_folder(options.install_path)

    #######################
    ## install or only check
    #######################
    option_install = False
    if options.option == 'install':
        print("\n+ Check dependencies")
        print("+ Try to install all missing dependencies, modules or third party software...")
        option_install = True

        ## check if access and permission
        if not os.path.isdir(options.install_path):
            _config_exit_not_a_folder(options.install_path)

        if set_config.access_check(options.install_path, mode=os.F_OK):
            print("Installation path is accessible and has permission for installation if necessary")
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("No access/permission for this path: %s" % options.install_path, 'red'))
            print(colored("Please provide a valid path with access/permission to install any missing dependencies.", 'red'))
            exit()

    elif options.option == 'only_check':
        print("\nCheck dependencies, modules or third party software and print report...")

    #######################
    ## python version
    #######################
    _config_section_header('Python:')
    this_python_version = str(sys.version)
    python_min_version = extern_progs.return_min_version_soft('python')
    _config_require_min_version(python_min_version, this_python_version)

    #######################
    ## perl_version
    #######################
    print('\n')
    _config_section_header('Perl:', 50)
    perl_min_version = extern_progs.return_min_version_soft('perl')
    this_perl_path = set_config.get_exe("perl", Debug)
    this_perl_version = set_config.get_version("perl", this_perl_path, Debug)
    _config_require_min_version(perl_min_version, this_perl_version)

    #######################
    ## third-party software
    #######################
    print('\n')
    _config_section_header('External dependencies:')
    set_config.check_dependencies(option_install, options.install_path, Debug)
    print('\n')

    #######################
    ## python packages
    #######################
    print('\n')
    _config_section_header('Python packages:')
    set_config.check_python_packages(Debug, option_install, options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## perl packages
    #######################
    print('\n')
    _config_section_header('Perl packages:')
    set_config.check_perl_packages("perl_dependencies", Debug, option_install, options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## IslandPath dependencies
    #######################
    if options.IslandPath:
        print('\n')
        _config_section_header('IslandPath packages and software required:')
        set_config.check_IslandPath(Debug, option_install, options.install_path)
        HCGB_aes.print_sepLine("+", 20, False)
        print('\n')

    #######################
    ## R packages
    #######################
    print('\n')
    _config_section_header('R packages:')
    set_config.check_R_packages(option_install, options.install_path, Debug)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')
def run_database(options):
    """Create or update the databases used by the pipeline.

    Steps performed, in order:

    1. Print the pipeline header and, if any of ``options.help_ARIBA``,
       ``options.help_BUSCO`` or ``options.help_KMA`` is set, print that help
       and exit.
    2. Create the database folder ``options.path`` (made absolute in place).
    3. If ``options.ID_file`` and/or ``options.descendant`` are given, retrieve
       NCBI entries and build the ``KMA_db/genbank`` index.
    4. If ``options.project_folder`` is given, collect user data and build the
       ``KMA_db/user_data`` index.
    5. Download ARIBA databases unless ``options.no_ARIBA`` is set.
    6. Download the KMA databases listed in ``options.kma_dbs``
       (plus ``bacteria`` and ``plasmids`` unless ``options.no_def_kma``).
    7. Create the ``BUSCO`` subfolder if ``options.BUSCO_dbs`` is given.

    Side effects: sets the module-level ``Debug`` flag, rewrites
    ``options.path`` and may rewrite ``options.kma_dbs``.

    :param options: parsed command line options for the database module.
    :type options: argparse-like namespace (attributes listed above)

    :return: empty tuple.
    :rtype: tuple
    """

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()

    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)

        ## stays None until one of the NCBI sources returns a table; avoids a
        ## NameError further down when the ID file provided does not exist
        dataBase_NCBI = None

        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index
            else:
                print(
                    colored(
                        "ERROR: File provided does not exists: %s" %
                        options.ID_file, 'red'))

        ## Get all entries belonging to this taxon provided
        ## NOTE(review): when both options are given, the descendant table
        ## replaces the one obtained from the ID file -- confirm this is intended
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)

            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ## nothing retrieved: stop with a clear message instead of crashing
        if dataBase_NCBI is None:
            print(
                colored(
                    "ERROR: No NCBI information available to update the database.",
                    'red'))
            exit()

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:

        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ## only the databases requested by the user
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ## defaults + user requested, without duplicates
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                ## values live on the options object, not in local names
                print(options.ariba_users_fasta)
                print(getattr(options, 'ariba_users_meta', None))

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        ## only the folder is prepared here; datasets are fetched later
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect, retrieve_databases):
    """Generate the MLST profile identification of each sample.

    Relies on the `MLSTar software`_ to obtain Multi locus sequence typing
    (MLST) schemes from PubMLST_ for the species assigned to every sample and
    to type the assembled genome of that sample.

    .. attention:: At the moment the function prints the result of
       :func:`MLSTar.get_MLSTar_package_installed` and calls ``exit()`` right
       away (MLSTar installation is still a TODO), so none of the steps
       described below are reached.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident`
       main function; ``input``, ``debug`` and ``threads`` are read here.
    :param dataFrame: pandas dataframe for samples to process (not read by this function).
    :param outdir_dict: dictionary with the output folder of each sample (keys are sample names).
    :param dataFrame_edirect: pandas dataframe with, at least, columns ``sample``, ``genus`` and ``species``.
    :param retrieve_databases: pandas dataframe with, at least, columns ``db`` and ``path``;
       rows with ``db == 'PubMLST'`` hold ``species,folder`` in ``path``.

    :type options:
    :type dataFrame: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type dataFrame_edirect: pandas.DataFrame()
    :type retrieve_databases: pandas.DataFrame()

    :return: Dictionary where keys are sample names and values are the first
       element returned by :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`.
    :rtype: Dictionary

    See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:

    .. include:: ../../devel/results/doMLST_result_example.csv
        :literal:

    .. seealso:: Additional information to PubMLST available datasets.

        - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`

    .. include:: ../../links.inc
    """
    ## Rscript executable
    rscript = set_config.get_exe("Rscript")

    ## TODO: Samples might not be assembled...to take into account and return 0

    ## TODO: Fix and install MLSTar during installation
    ## the module stops here until MLSTar installation is sorted out
    print(MLSTar.get_MLSTar_package_installed())
    exit()
    ########################################################################################

    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed]

    ## debug message
    if Debug:
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## header
    HCGB_aes.boxymcboxface("MLST typing")
    print(
        "+ Create classical MLST typification of each sample according to species retrieved by kmer..."
    )

    ## assemblies available for the samples
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if Debug:
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    ## results: sample name -> MLSTar output
    MLST_results = {}

    ## PubMLST entries: each one is "species,folder"
    pubmlst_entries = retrieve_databases.loc[retrieve_databases['db'] == 'PubMLST']
    mlst_profile_list = pubmlst_entries['path'].tolist()

    if Debug:
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)
        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## one MLST call per sample, according to the species identified
    for _, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'], row['species'])

        ## species without PubMLST profile: report and go for the next sample
        if MLSTar_taxa_name == 'NaN':
            print(
                colored(
                    "\t- Not available PubMLST profile for sample [%s] identified as %s %s"
                    % (row['sample'], row['genus'], row['species']), 'yellow'))
            continue

        for mlst_profile in mlst_profile_list:
            profile_fields = mlst_profile.split(',')
            species_mlst = profile_fields[0]
            species_mlst_folder = profile_fields[1]

            ## scheme table and success stamp live in the species folder
            output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
            filename_stamp = species_mlst_folder + '/.success_scheme'

            ## only the profile matching the species of this sample is used
            if MLSTar_taxa_name != species_mlst:
                continue

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s" % stamp,
                        'yellow'))
            else:
                ### get scheme available
                MLSTar.getPUBMLST(MLSTar_taxa_name, rscript, output_file)
                stamp = HCGB_time.print_time_stamp(filename_stamp)

            ## parse and get scheme for classical MLST
            schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

            ## the last scheme with len < 10 wins
            ## NOTE(review): scheme2use is left as it was (or unbound on the
            ## first pass) when no scheme has len < 10 -- confirm expected input
            for _, cluster in schemes_MLST.iterrows():
                if cluster['len'] < 10:
                    scheme2use = int(cluster['scheme'])

            ## sample details
            sample = row['sample']
            MLSTar_folder = HCGB_files.create_subfolder('MLST', outdir_dict[sample])
            sample_rows = assembly_samples_retrieved.loc[
                assembly_samples_retrieved['name'] == sample]
            genome_file = sample_rows['sample'].values[0]

            ## call MLST
            (results, profile_folder) = MLSTar.run_MLSTar(
                species_mlst_folder, rscript, MLSTar_taxa_name, scheme2use,
                sample, MLSTar_folder, genome_file, options.threads)
            MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
    Run the KMA identification of one sample against one database.

    A ``kma`` subfolder is created within ``outdir_file``. If a
    ``.success_<outfile basename>`` stamp file is already there, the previous
    time stamp is reported and nothing else is done; otherwise KMA is called
    (with option ``-shm 1``) and the stamp file is written afterwards.

    :param outdir_file: output folder of the sample.
    :param list_files: files of the sample to use in the identification.
    :param name: sample name.
    :param database: database to search against.
    :param threads: number of threads to use in the call.
    :param Debug: True/False for printing debugging messages.

    :type outdir_file: string
    :type list_files: list
    :type name: string
    :type database: string
    :type threads: integer
    :type Debug: bool

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`

        - :func:`BacterialTyper.module.ident.get_outfile`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`
    """
    if Debug:
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## folder for KMA results of this sample
    kma_outdir = HCGB_files.create_subfolder("kma", outdir_file)

    ## executable
    kma_bin = set_config.get_exe("kma")

    ## result file and the stamp flagging a previous successful run
    outfile = get_outfile(kma_outdir, name, database)
    stamp_file = kma_outdir + '/.success_' + os.path.basename(outfile)

    if Debug:
        print("Outdir: ", kma_outdir)
        print("outfile: ", outfile)
        print("Filename_stamp: ", stamp_file)

    ## previously run and succeeded: report it and leave
    if os.path.isfile(stamp_file):
        stamp = HCGB_time.read_time_stamp(stamp_file)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
        return

    ## debug message
    if Debug:
        print(
            colored(
                "**DEBUG: species_identification_KMA.kma_ident_module call**",
                'yellow'))
        print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
        print("outfile: ", outfile)
        print(
            "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
        )
        print("species_identification_KMA.kma_ident_module" + "\t" + outfile +
              "\t" + str(list_files) + "\t" + name + "\t" + database + "\t" +
              str(threads) + "\n")

    ## option to retrieve the database from memory
    kma_option = '-shm 1'

    ## call KMA and leave the success stamp
    species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                              database, kma_bin, kma_option,
                                              threads)
    stamp = HCGB_time.print_time_stamp(stamp_file)
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases, time_partial):
    """Kmer identification using software KMA_.

    .. attention:: The identification is currently switched off: the function
       returns an empty dataframe as its very first statement. The steps
       below describe the code kept after that statement.

    Steps (disabled code):

    1. Keep the databases in ``retrieve_databases`` whose ``source`` starts
       with ``KMA`` and that are reported as indexed.
    2. For each database: load it on memory, send one
       :func:`BacterialTyper.modules.ident.send_kma_job` per sample name using
       a thread pool, and remove the database from memory.
    3. Parse the ``.spa`` result of every sample/database pair and gather
       everything in a single dataframe with extra columns ``Database`` and
       ``Sample``.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident`
       main function; ``threads`` and ``KMA_cutoff`` are read here.
    :param pd_samples_retrieved: pandas dataframe for samples to process
       (columns ``name`` and ``sample`` are read).
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param retrieve_databases: pandas dataframe with columns ``source``, ``db`` and ``path``.
    :param time_partial: timestamp of start time of the process.

    :type options:
    :type pd_samples_retrieved: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type retrieve_databases: pandas.DataFrame()
    :type time_partial:

    :return: Information of the identification (empty dataframe while disabled).
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:

    .. include:: ../../devel/results/KMA_ident_example.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.modules.ident.send_kma_job`

        - :func:`BacterialTyper.modules.ident.get_outfile`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`

    .. include:: ../../links.inc
    """
    ## KMA identification is disabled: callers get an empty dataframe.
    ## Everything below is unreachable and kept for when it is re-enabled.
    return (pd.DataFrame())

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])

            index_status = species_identification_KMA.check_db_indexed(
                db2use['path'], fold_name)
            if (index_status == True):
                print(
                    colored(
                        "\t+ Databases %s seems to be fine...\n\n" %
                        db2use['db'], 'green'))
                databases2use.append(db2use['path'])
            else:
                print(
                    colored(
                        "\t**Databases %s is not correctly indexed. Not using it...\n"
                        % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
                'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: always keep one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name
    ## a plain column label (not a one-element list) keeps the group keys as
    ## plain sample names, which is what outdir_dict is indexed by
    sample_frame = pd_samples_retrieved.groupby("name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(
                kma_bin, db2use)

            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    ## best effort: report the failing sample and go on
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(
                kma_bin, db2use)

            if (return_code_rm == 'FAIL'):
                ## the removal command is built inside remove_db and is not
                ## available here: report the database to clean up instead
                print(
                    colored(
                        "***ERROR: Removing database from memory failed. Please do it manually! Database: %s"
                        % db2use, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")

    ## tables are collected and concatenated once at the end
    ## (DataFrame.append is no longer available in pandas >= 2.0)
    collected_results = []
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided

        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder(
                "kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Defaulta: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            n_hits = results.index.size
            several_allowed = basename_db in ("plasmids.T", "viral.TG")

            if n_hits > 1 and not several_allowed:
                print(
                    colored("###########################################",
                            'yellow'))
                print(
                    colored("Sample %s contains multiple strains." % name,
                            'yellow'))
                print(
                    colored("###########################################",
                            'yellow'))
                print(colored(results, 'yellow'))
                print('\n\n')
                ## TODO: add multi-isolate flag

            elif n_hits == 0:
                print(
                    colored(
                        '\tNo clear strain from database %s has been assigned to sample %s'
                        % (basename_db, name), 'yellow'))

            ## every case is added to the summary: several entries, one clear
            ## reference or an empty table if nothing was assigned
            results['Sample'] = name
            collected_results.append(results)

    if collected_results:
        results_summary = pd.concat(collected_results, ignore_index=True)
    else:
        results_summary = pd.DataFrame()

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)