def run_BacterialTyper(options):
    """Start a complete BacterialTyper analysis.

    Sets the module-level ``Debug`` flag from ``options.debug``; if any
    ``options.help_*`` flag is set, prints the matching help message and
    exits. Otherwise prints the pipeline header and resolves the absolute
    input and output folders.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, the ``help_*`` flags, ``input``, ``project``, ``detached``
        and ``output_folder``.
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## every help option terminates: without this exit the whole
        ## pipeline would start right after printing the help message
        exit()
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()
    elif (options.help_ARIBA):
        ## information for ARIBA
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_KMA):
        ## information for KMA Software
        species_identification_KMA.help_kma_database()
        exit()
    elif (options.help_MLSTar):
        ## information for MLSTar Software
        MLSTar.help_MLSTar()
        exit()
    elif (options.help_PhiSpy):
        ## information for PhiSpy software
        bacteriophage.help_PhiSpy()
        exit()
    elif (options.help_MGE_analysis):
        ## information for MGE module analysis
        MGE.help_MGE_analysis()
        exit()
    elif (options.help_input_MGE):
        ## information for the input of the MGE module
        MGE.help_input_MGE()
        exit()

    ### message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("BacterialTyper analysis")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    ## (outdir stays empty when neither mode is selected)
    if (options.project):
        outdir = input_dir
    elif (options.detached):
        outdir = os.path.abspath(options.output_folder)
def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using BUSCO_
    software and database.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, ``help_format``, ``help_BUSCO``, ``help_project``,
        ``help_multiqc``, ``single_end``, ``input``, ``detached``,
        ``output_folder``, ``threads`` and ``no_BUSCO``. The attributes
        ``pair`` and ``project`` are set by this function.
    :returns: Empty tuple.

    .. seealso:: This function depends on other BacterialTyper and HCGB
        functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`
        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`
        - :func:`BacterialTyper.modules.qc.BUSCO_check`
        - :func:`HCGB.sampleParser`
        - :func:`HCGB.functions.aesthetics_functions`
        - :func:`HCGB.functions.time_functions`
        - :func:`HCGB.functions.main_functions`
        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    options.pair = not options.single_end

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "assemble",
                                            options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: keep at least one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name.
    # Group by the scalar key, not by a one-element list: since pandas 2.0 a
    # list grouper yields 1-tuples as group keys, which would not match the
    # keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("name")

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each sample
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()),
                            threads_job): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ## NOTE(review): assembly_stats is not defined in this function;
    ## presumably a module-level dictionary filled by check_sample_assembly
    ## -- confirm.
    if (assembly_stats):
        ###################
        if Debug:
            HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
            print(assembly_stats)

        ## create single file
        get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        qc.BUSCO_check(outdir, outdir, options, start_time_partial, "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
def run_database(options):
    """Main function of the database module.

    Creates the database folder ``options.path`` and fills it according to
    the options provided: NCBI genomes, genomes of a user project folder,
    ARIBA databases, KMA databases and the BUSCO folder.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, ``help_ARIBA``, ``help_BUSCO``, ``help_KMA``, ``path``,
        ``ID_file``, ``descendant``, ``project_folder``, ``no_ARIBA``,
        ``no_def_ARIBA``, ``ariba_dbs``, ``ariba_users_fasta``,
        ``ariba_users_meta``, ``threads``, ``no_def_kma``, ``kma_dbs`` and
        ``BUSCO_dbs``. ``options.path`` is replaced by its absolute path and
        ``options.kma_dbs`` is extended with the default databases unless
        ``no_def_kma`` is set.
    :returns: Empty tuple.
    """
    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path,
                      'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)

        ## stays None when no NCBI information could be retrieved
        dataBase_NCBI = None

        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" %
                      abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index

        ## Get all entries belonging to this taxon provided
        ## NOTE(review): when both ID_file and descendant are given, the
        ## result below replaces the one retrieved from ID_file -- confirm
        ## this is intended.
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)

            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ## ID_file given but not an existing file and no descendant option:
        ## there is nothing to index, stop with an explicit message instead
        ## of failing later on an unbound variable
        if dataBase_NCBI is None:
            print(
                colored(
                    "ERROR: NCBI ID file provided does not exist: %s" %
                    options.ID_file, 'red'))
            exit()

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()

        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))
    else:
        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ## only the databases selected by the user
            ariba_dbs_list = options.ariba_dbs
        else:
            ## default databases plus the ones selected by the user
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                ## these values live on the options object; the bare names
                ## used before were undefined and raised NameError
                print(options.ariba_users_fasta)
                ## presumably always set by the argument parser; getattr
                ## keeps the debug output from failing if it is not
                print(getattr(options, 'ariba_users_meta', None))

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = os.path.join(options.path, 'KMA_db')
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    if (options.no_def_kma):
        ## only user dbs
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()
    else:
        ## default dbs + user
        kma_dbs = ["bacteria", "plasmids"]
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        ## only the folder is created here; the datasets are not downloaded
        HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
def run_info(options):
    """Print the help message requested in ``options`` and exit.

    The ``help_*`` flags are checked one after another in a fixed order; for
    the first one that is set the matching help message is printed and
    ``exit()`` is called. Nothing happens when no flag is set.

    :param options: Parsed command-line options providing every ``help_*``
        attribute listed in the table below.
    """

    def _show_ariba_help():
        ## ARIBA is the only topic that prints a title before its help
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()

    ## (flag name, help printer) in checking order. Printers are wrapped in
    ## lambdas so the helper is only looked up when its flag is set.
    help_topics = (
        ("help_project", lambda: project_help()),
        ("help_format", lambda: help_fastq_format()),
        ("help_Prokka", lambda: annotation.print_list_prokka()),
        ("help_BUSCO", lambda: BUSCO_caller.print_help_BUSCO()),
        ("help_ARIBA", _show_ariba_help),
        ("help_trimm_adapters",
         lambda: trimmomatic_call.print_help_adapters()),
        ("help_multiqc", lambda: multiQC_report.multiqc_help()),
        ("help_KMA", lambda: species_identification_KMA.help_kma_database()),
        ("help_PhiSpy", lambda: bacteriophage.help_PhiSpy()),
        ("help_MGE_analysis", lambda: MGE.help_MGE_analysis()),
        ("help_input_MGE", lambda: MGE.help_input_MGE()),
        ("help_MLSTar", lambda: MLSTar.help_MLSTar()),
        ("help_Mash", lambda: min_hash_caller.helpMash()),
        ("help_Snippy", lambda: variant_calling.help_Snippy()),
        ("help_Dimob", lambda: genomic_island.help_Dimob()),
    )

    for flag_name, show_help in help_topics:
        if getattr(options, flag_name):
            show_help()
            exit()
def run_annotation(options):
    """Main function of the annotation module.

    Retrieves the assemblies available in the input folder, sends one
    ``annot_caller`` job per assembly to a thread pool, optionally generates
    a MultiQC report of the results and finally calls
    ``qc.BUSCO_check`` in ``"proteins"`` mode.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, ``help_format``, ``help_BUSCO``, ``help_project``,
        ``help_multiqc``, ``help_Prokka``, ``input``, ``detached``,
        ``output_folder``, ``kingdom``, ``genera``, ``threads`` and
        ``skip_report``. The attributes ``batch`` and ``project`` are set by
        this function.
    :returns: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## every help option terminates: without this exit the annotation
        ## would start right after printing the help message
        exit()
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ### message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna; Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap; Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: keep at least one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = list(outdir_dict.values())
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    qc.BUSCO_check(input_dir, outdir, options, start_time_partial_BUSCO,
                   "proteins")

    ## print to file: results
    print("\n*************** Finish *******************")
    HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
def run_QC(options):
    """Main function of the quality check module.

    Prints the requested help message and exits if any help flag is set.
    Otherwise runs a single check selected by the first flag set among
    ``raw_reads``, ``trim_reads``, ``assembly`` and ``annotation``:
    ``fastqc`` for reads, ``BUSCO_check`` for assemblies or annotations.

    :param options: Parsed command-line options. Attributes read here:
        ``help_format``, ``help_BUSCO``, ``help_project``, ``help_multiqc``,
        ``debug``, ``single_end``, ``input``, ``detached``,
        ``output_folder``, ``raw_reads``, ``trim_reads``, ``assembly`` and
        ``annotation``. The attributes ``pair`` and ``project`` are set by
        this function.
    :returns: Empty tuple.
    """
    global Debug

    ## init time
    start_time_total = time.time()

    ## help messages: each one terminates the program
    if options.help_format:
        help_info.help_fastq_format()
        exit()
    if options.help_BUSCO:
        BUSCO_caller.print_help_BUSCO()
        exit()
    if options.help_project:
        help_info.project_help()
        exit()
    if options.help_multiqc:
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    Debug = True if options.debug else False

    ## paired-end mode unless single-end reads are requested
    options.pair = False if options.single_end else True

    ## set main header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Quality check")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out; project mode is the default
    input_dir = os.path.abspath(options.input)
    if not options.detached:
        options.project = True
        outdir = input_dir
    else:
        options.project = False
        outdir = os.path.abspath(options.output_folder)

    ## raw reads
    if options.raw_reads:
        raw_samples = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        fastqc(raw_samples, outdir, options, start_time_total, "raw", Debug)
        return ()

    ## trimmed reads
    if options.trim_reads:
        trimmed_samples = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        fastqc(trimmed_samples, outdir, options, start_time_total, "trimmed",
               Debug)
        return ()

    ## assemblies
    if options.assembly:
        BUSCO_check(input_dir, outdir, options, start_time_total, "genome")
        return ()

    ## annotations
    if options.annotation:
        BUSCO_check(input_dir, outdir, options, start_time_total, "proteins")

    return ()
def BUSCO_check(input_dir, outdir, options, start_time_total, mode):
    """Check the quality of assemblies or annotations using BUSCO.

    Retrieves the samples to check according to ``mode``, creates one output
    folder per sample, calls ``BUSCO_caller.BUSCO_call`` and, unless
    ``options.skip_report`` is set, generates summary plots and writes the
    statistics as ``BUSCO_stats.csv`` and ``BUSCO_stats.xlsx``.

    :param input_dir: Folder searched for assemblies in ``"genome"`` mode.
    :param outdir: Output folder; in ``"proteins"`` mode it is also the
        folder searched for annotation files.
    :param options: Parsed command-line options. Attributes read here:
        ``database``, ``debug``, ``project``, ``BUSCO_dbs``, ``threads`` and
        ``skip_report``.
    :param start_time_total: Start time handed to ``HCGB_time.timestamp``.
    :param mode: Either ``"genome"`` or ``"proteins"``.
    :returns: The first element returned by ``BUSCO_caller.BUSCO_call``.
    :raises ValueError: If ``mode`` is not ``"genome"`` or ``"proteins"``.
    """
    ## fail early with a clear message: any other mode would leave the
    ## sample table undefined further down
    if mode not in ('genome', 'proteins'):
        raise ValueError(
            "BUSCO_check mode must be 'genome' or 'proteins': %r" % (mode,))

    HCGB_aes.boxymcboxface("BUSCO Analysis Quality check")

    ## absolute path for in & out
    database_folder = os.path.abspath(options.database)

    ## get files and get dir for each sample according to mode
    if mode == 'genome':
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("assembly_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "assemble_qc",
                                                      options.debug)
    else:
        ## mode == 'proteins'
        pd_samples_retrieved = sampleParser.files.get_files(
            options, outdir, "annot", ["faa"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("annot_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "annot_qc",
                                                      options.debug)

    ## add column to dataframe: BUSCO output folder of each sample
    pd_samples_retrieved['busco_folder'] = ""
    for index, row in pd_samples_retrieved.iterrows():
        pd_samples_retrieved.at[index, 'busco_folder'] = BUSCO_outdir_dict[
            row['name']]

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("df_samples_busco", 'yellow')
        print(pd_samples_retrieved)

        HCGB_aes.debug_message("BUSCO_outdir_dict", 'yellow')
        print(BUSCO_outdir_dict)

    ## Check each using BUSCO
    BUSCO_Database = HCGB_files.create_subfolder('BUSCO', database_folder)
    if not os.path.exists(BUSCO_Database):
        HCGB_files.create_folder(BUSCO_Database)

    ## call
    (dataFrame_results, stats_results) = BUSCO_caller.BUSCO_call(
        options.BUSCO_dbs, pd_samples_retrieved, BUSCO_Database,
        options.threads, mode)

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("dataFrame_results", 'yellow')
        HCGB_main.print_all_pandaDF(dataFrame_results)

    ## functions.timestamp
    print("+ Quality control of all samples finished: ")
    HCGB_time.timestamp(start_time_total)

    ## multiqc report plot
    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report BUSCO plot.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        print(
            "+ Detail information for each sample could be identified in separate folders."
        )

        ## name folder according to mode
        if mode == 'genome':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_assembly",
                                                       outdir_report)
        else:
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_annot",
                                                       outdir_report)

        ## generate plots
        print("+ Generate summarizing plots...")
        BUSCO_caller.BUSCO_plots(dataFrame_results, BUSCO_report,
                                 options.threads)
        print('\n+ Check quality plots in folder: %s' % BUSCO_report)

        ## TODO
        ## Parse BUSCO statistics in dataframe (stats_results) for
        ## discarding samples if necessary: given a cutoff, discard or
        ## advise to discard some samples

        ### print statistics
        stats_results.to_csv(os.path.join(BUSCO_report, "BUSCO_stats.csv"))
        name_excel = os.path.join(BUSCO_report, "BUSCO_stats.xlsx")
        ## ExcelWriter.save() was removed in pandas 2.0; the context manager
        ## writes and closes the workbook on every pandas version
        with pd.ExcelWriter(name_excel, engine='xlsxwriter') as writer:
            stats_results.to_excel(writer, sheet_name="BUSCO statistics")

        print('\n+ Check quality statistics in folder: %s' % BUSCO_report)

    return (dataFrame_results)