def run_phylo(options):
    """
    Main function acting as an entry point to the module *phylo*.
    """

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Phylogenetic reconstruction")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get the database
    options.database = os.path.abspath(options.database)

    ### parse the reference
    print("+ Retrieve the reference...")
    reference_gbk_file = get_reference_gbk(options)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ##################################
    ## select samples and map
    ##################################
    print("+ Retrieve samples to map available...")
    dict_folders = map_samples(options, reference_gbk_file, input_dir, outdir)

    if Debug:
        print(colored("**DEBUG: dict_folders **", 'yellow'))
        print(dict_folders)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ##################################
    ## Create core alignment
    ##################################
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    phylo_dir = HCGB_files.create_subfolder("phylo", outdir_report)
    analysis_dir = HCGB_files.create_subfolder(options.name, phylo_dir)
    snippy_dir = HCGB_files.create_subfolder("snippy", analysis_dir)

    list_folders = list(dict_folders.values())
    options_string = ""
    variant_calling.snippy_core_call(list_folders, options_string, options.name,
                                     snippy_dir, options.output_format, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## snp distance matrix
    snp_distance_dir = HCGB_files.create_subfolder("snp_distance", analysis_dir)
    name_matrix = os.path.join(snp_distance_dir, "snp_matrix_" + options.name)
    countGaps = False
    aln_file = os.path.join(snippy_dir, options.name + '.aln')
    phylo_parser.get_snp_distance(aln_file, options.output_format, countGaps,
                                  name_matrix, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## phylogenetic analysis
    iqtree_output = HCGB_files.create_subfolder("iqtree", analysis_dir)
    phylo_parser.ml_tree(snippy_dir, options.name, options.threads,
                         iqtree_output, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting phylo module.")
    return()
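## ---------------------------------------------------------------------
## Usage sketch (hypothetical, not part of the pipeline): run_phylo()
## expects an argparse-style object; the attribute names below mirror
## the ones accessed in the function body above.
## ---------------------------------------------------------------------
# from argparse import Namespace
# opts = Namespace(help_format=False, help_project=False, debug=False,
#                  single_end=False, detached=False,
#                  input="my_project", output_folder="out",
#                  database="db", name="run1", output_format="fasta",
#                  threads=4)
# run_phylo(opts)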
def run_search(arg_dict):
    """Main function of the search module in BacDup package.

    This module searches for and creates the gene duplication analysis. It allows
    the user to provide either a previously parsed data project (NCBI GenBank IDs,
    taxonomy or user annotation data) or single or multiple samples.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()
    if (arg_dict.blast_help):
        info.blast_help()
        exit()
    if (arg_dict.project_help):
        info.project_help()
        exit()
    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files + Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: ****
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)
    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())

    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                ## save results as a .csv file
                sort_csv = os.path.abspath(os.path.join(dict_dup_folders[sample],
                                                        'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored("\t+ Filter results already available for sample %s [%s]"
                              % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)
        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored("\t+ Duplicate annotation already available for sample %s [%s]"
                          % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data
        #data2add_entry = data2add_entry.reset_index()
        data2add = data2add.append(data2add_entry, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return()
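## ---------------------------------------------------------------------
## Minimal sketch of the checkpoint pattern used above: a step is skipped
## when its hidden ".{step}_success" stamp file exists and is non-empty.
## This standalone helper only illustrates the idea; the pipeline itself
## relies on HCGB_time.print_time_stamp()/read_time_stamp().
## ---------------------------------------------------------------------
import os
import time

def step_is_done(stamp_file):
    """Return True if a previous run left a non-empty success stamp."""
    return os.path.isfile(stamp_file) and os.path.getsize(stamp_file) > 0

def mark_step_done(stamp_file):
    """Write the current time into the stamp file."""
    with open(stamp_file, 'w') as fh:
        fh.write(time.strftime("%Y-%m-%d %H:%M:%S"))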
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases, time_partial):
    """Kmer identification using software KMA_.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
    :param pd_samples_retrieved: pandas dataframe for samples to process.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param retrieve_databases: pandas dataframe with the databases to check (columns: source, db, path).
    :param time_partial: timestamp of start time of the process.

    :type options:
    :type pd_samples_retrieved: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type retrieve_databases: pandas.DataFrame()
    :type time_partial:

    :return: Information of the identification. See example below.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:

    .. include:: ../../devel/results/KMA_ident_example.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.modules.ident.send_kma_job`

        - :func:`BacterialTyper.modules.ident.get_outfile`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`

    .. include:: ../../links.inc
    """

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])

            index_status = species_identification_KMA.check_db_indexed(db2use['path'], fold_name)
            if (index_status == True):
                print(colored("\t+ Database %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append(db2use['path'])
            else:
                #databases2use.remove(db2use)
                print(colored("\t**Database %s is not correctly indexed. Not using it...\n" % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**", 'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(kma_bin, db2use)

            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()),
                                name, db2use, threads_job, Debug): name
                for name, cluster in sample_frame}

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(kma_bin, db2use)
            if (return_code_rm == 'FAIL'):
                print(colored("***ERROR: Removing database %s from memory failed. Please do it manually!" % db2use, 'red'))

    ## functions.timestamp
    time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")
    results_summary = pd.DataFrame()
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided
        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Default: 80]
            results = species_identification_KMA.parse_kma_results(result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            if (results.index.size > 1):
                if (basename_db == "plasmids.T" or basename_db == "viral.TG"):
                    ## let it be several entries
                    results['Sample'] = name
                    results_summary = results_summary.append(results, ignore_index=True)
                else:
                    print(colored("###########################################", 'yellow'))
                    print(colored("Sample %s contains multiple strains." % name, 'yellow'))
                    print(colored("###########################################", 'yellow'))
                    print(colored(results, 'yellow'))
                    print('\n\n')

                    ## add both strains if detected
                    results['Sample'] = name
                    results_summary = results_summary.append(results, ignore_index=True)

                    ## TODO: add multi-isolate flag

            elif (results.index.size == 1):  ## 1 clear reference
                results['Sample'] = name
                results_summary = results_summary.append(results, ignore_index=True)

            else:
                print(colored('\tNo clear strain from database %s has been assigned to sample %s' % (basename_db, name), 'yellow'))
                ## add empty line if no available
                results['Sample'] = name
                results_summary = results_summary.append(results, ignore_index=True)

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)
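## ---------------------------------------------------------------------
## Sketch of the thread/worker split used in KMA_ident() and the other
## modules: the total CPU budget is divided into threads per job and a
## number of concurrent workers. Assumes HCGB_main.optimize_threads()
## behaves roughly like the integer division below.
## ---------------------------------------------------------------------
def split_threads(total_threads, n_samples):
    """Return (threads_per_job, max_workers) for a ThreadPoolExecutor."""
    threads_job = max(1, total_threads // max(1, n_samples))
    max_workers = max(1, total_threads // threads_job)
    return (threads_job, max_workers)

## e.g. 8 threads and 3 samples -> 2 threads per job, 4 concurrent workers
# assert split_threads(8, 3) == (2, 4)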
def run_ident(options):
    """
    Main function acting as an entry point to the module *ident*.

    Arguments:

    .. seealso:: Additional information to PubMLST available datasets.

        - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
    """

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_KMA):
        ## information for KMA Software
        species_identification_KMA.help_kma_database()
        exit()
    elif (options.help_MLSTar):
        ## information for MLSTar Software
        MLSTar.help_MLSTar()
        exit()

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ### species_identification_KMA -> most similar taxa
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Species identification")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    global Project
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                        ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "ident", options.debug)

    ## let's start the process
    print("+ Generate a species typification for each sample retrieved using:")
    print("(1) Kmer alignment (KMA) software.")
    print("(2) Pre-defined databases by KMA or user-defined databases.")

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ######## KMA identification
    dataFrame_kma = KMA_ident(options, pd_samples_retrieved, outdir_dict,
                              retrieve_databases, start_time_partial)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve results to summarize **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print("dataframe_kma")
        print(dataFrame_kma)

    ## exit if viral search
    skip = False
    if (len(options.kma_dbs) == 1):
        for i in options.kma_dbs:
            if (i == 'viral'):
                print()
                MLST_results = ''
                options.fast = True
                skip = True

    ## what if only plasmids?

    ## do edirect and MLST if bacteria
    if (not skip):
        dataFrame_edirect = pd.DataFrame()

        ######## EDirect identification
        #dataFrame_edirect = edirect_ident(dataFrame_kma, outdir_dict, Debug)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: retrieve results from NCBI **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("dataFrame_edirect")
            print(dataFrame_edirect)

        ######## MLST identification
        MLST_results = MLST_ident(options, dataFrame_kma, outdir_dict,
                                  dataFrame_edirect, retrieve_databases)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: retrieve results to summarize **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("MLST_results")
            print(MLST_results)

    ## generate summary for sample: all databases
    ## MLST, plasmids, genome, etc
    HCGB_aes.boxymcboxface("Results Summary")

    #####################################
    ## Summary identification results  ##
    #####################################

    ## parse results
    if options.project:
        final_dir = os.path.join(outdir, 'report', 'ident')
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = outdir

    ###
    excel_folder = HCGB_files.create_subfolder("samples", final_dir)
    print('+ Print summary results in folder: ', final_dir)
    print('+ Print sample results in folder: ', excel_folder)

    # Group dataframe results summary by sample name
    sample_results_summary = dataFrame_kma.groupby(["Sample"])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: sample_results_summary **", 'yellow'))
        print(sample_results_summary)

    ##
    results_summary_KMA = pd.DataFrame()
    MLST_all = pd.DataFrame()
    for name, grouped in sample_results_summary:

        ## create an excel and txt for sample
        name_sample_excel = excel_folder + '/' + name + '_ident.xlsx'
        name_sample_csv = outdir_dict[name] + '/ident_summary.csv'  ## check in detached mode

        writer_sample = pd.ExcelWriter(name_sample_excel, engine='xlsxwriter')  ## open excel handle

        ## subset dataframe & print result
        results_summary_toPrint_sample = grouped[['Sample', '#Template', 'Query_Coverage',
                                                  'Template_Coverage', 'Depth', 'Database']]
        results_summary_toPrint_sample.to_excel(writer_sample, sheet_name="KMA")  ## write excel handle
        results_summary_toPrint_sample.to_csv(name_sample_csv)  ## write csv for sample

        ## read MLST
        if MLST_results:
            if name in MLST_results:
                sample_MLST = pd.read_csv(MLST_results[name], header=0, sep=',')
                sample_MLST['genus'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'genus'].values[0]
                sample_MLST['species'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'species'].values[0]
                sample_MLST.to_excel(writer_sample, sheet_name="MLST")  ## write excel handle

                ## Return information to excel
                MLST_all = pd.concat([MLST_all, sample_MLST])

        ## close excel handle
        writer_sample.save()

    ##
    name_excel = final_dir + '/identification_summary.xlsx'
    print('+ Summary information in excel file: ', name_excel)
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')  ## open excel handle

    ## KMA dataframe: print result for sources
    results_summary_KMA = dataFrame_kma[['Sample', '#Template', 'Query_Coverage',
                                         'Template_Coverage', 'Depth', 'Database']]

    ## Sum plasmid and chromosome statistics ##
    ## sum coverage
    total_coverage = results_summary_KMA.groupby('Sample')['Query_Coverage'].sum().reset_index()

    ## debug message
    if (Debug):
        print("*** Sum: Query_coverage ***")
        print(total_coverage)

    ## TODO: FIX SUMMARY REPORT
    results_summary_KMA = results_summary_KMA.set_index('Sample')
    results_summary_KMA = results_summary_KMA.sort_values(
        by=['Sample', 'Database', 'Query_Coverage'], ascending=[True, True, True])
    results_summary_KMA.to_excel(writer, sheet_name='KMA')  ## write excel handle

    ## write MLST
    if (MLST_results):
        MLST_all.to_excel(writer, sheet_name='MLST')

    ## write excel and close
    writer.save()  ## close excel handle

    print("\n+ Check summary of results in file generated")

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:
        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")

        ## debug message
        if (Debug):
            print(colored("**DEBUG: dataFrame_edirect **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print(dataFrame_edirect)

        ## dataFrame_edirect
        file_toprint = final_dir + '/edirect_info2download.csv'
        dataFrame_edirect.to_csv(file_toprint)

        ## update database with samples identified
        data2download = dataFrame_edirect.filter(['genus', 'species', 'strain', 'genome'])
        data2download = data2download.rename(columns={'genome': 'NCBI_assembly_ID',
                                                      'strain': 'name'})
        NCBI_folder = os.path.abspath(options.database) + '/NCBI'
        database_generator.NCBI_DB(data2download, NCBI_folder, Debug)

    else:
        print("+ No update of the database has been requested using option --fast")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting identification module.")
    return()
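## ---------------------------------------------------------------------
## Sketch of the per-sample summary writing used in run_ident(): group a
## results dataframe by sample and dump one sheet per data source. The
## column names are the KMA ones used above; the file layout is an
## assumption for illustration only.
## ---------------------------------------------------------------------
# import pandas as pd
# for sample, grouped in results.groupby("Sample"):
#     with pd.ExcelWriter("%s_ident.xlsx" % sample, engine="xlsxwriter") as writer:
#         grouped[['Sample', '#Template', 'Query_Coverage',
#                  'Template_Coverage', 'Depth', 'Database']].to_excel(
#             writer, sheet_name="KMA")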
def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug):

    HCGB_aes.boxymcboxface("FASTQC Quality check for samples")

    ## debug message
    if (Debug):
        print(colored("\n**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)
        print("\n")

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")

    ## if not project, outdir contains the dir to put output
    ## in this case, in some other cases might not occur
    if not options.project:
        HCGB_files.create_folder(outdir)

    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved,
                                            "fastqc_" + name_analysis, options.debug)

    print("+ Checking quality for each sample retrieved...")
    start_time_partial = start_time_total

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads), "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int), "yellow")
        HCGB_aes.debug_message("threads_job: " + str(threads_job), "yellow")

    ## send for each sample
    print("+ Calling fastqc for samples...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=int(max_workers_int)) as executor:
        commandsSent = {
            executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), name, threads_job): name
            for name, cluster in sample_frame}

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("+ FASTQC for samples has finished...")

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detailed information for each sample can be found in separate folders:")

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
            print(my_outdir_list)
            print("\n")

        fastqc_report = HCGB_files.create_subfolder("FASTQC", outdir_report)
        fastqc_final_report = HCGB_files.create_subfolder(name_analysis, fastqc_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_final_report, "")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % fastqc_final_report)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting qc module.")
    return()
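## ---------------------------------------------------------------------
## Minimal sketch of the executor fan-out pattern shared by fastqc() and
## the other modules: one future per sample, exceptions reported per
## sample instead of aborting the whole batch. do_work() is a placeholder
## for the per-sample caller.
## ---------------------------------------------------------------------
import concurrent.futures

def fan_out(samples, do_work, max_workers=4):
    """Run do_work(sample) for each sample; report failures individually."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(do_work, s): s for s in samples}
        for future in concurrent.futures.as_completed(futures):
            sample = futures[future]
            try:
                future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (sample, exc))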
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.

    This module prepares data for later gene duplication analysis. It allows
    the user to provide either a single sample, multiple samples, NCBI GenBank
    IDs or NCBI taxonomy IDs to retrieve and obtain the annotation data.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir
    else:
        arg_dict.project = True
        print("+ Generate a directory containing information within the project folder provided")
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
        debug_message('final_dir: ' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files + Reference fasta files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')

    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return()
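## ---------------------------------------------------------------------
## Sketch of the project/detached switch used by run_input() and the
## other entry points: detached mode writes directly into the output
## folder, project mode nests an "info" subfolder inside the project.
## Standalone helper for illustration only; the pipeline uses
## HCGB_files.create_subfolder().
## ---------------------------------------------------------------------
import os

def resolve_info_dir(outdir, detached):
    """Return the folder where input information should be stored."""
    if detached:
        return outdir                         # flat layout
    info_dir = os.path.join(outdir, "info")   # project layout
    os.makedirs(info_dir, exist_ok=True)
    return info_dir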
def run_report(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_spaTyper):
        ## help_format option
        get_spa_typing.help_spaTyper()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## set default
    options.batch = False

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Report generation module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## call assemble using spades
    start_time_partial = start_time_total

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    global input_dir
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    global Project
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ##
    print("\n+ Get project information:")

    ## get files: trimm, assembly, annotation
    pd_samples_retrieved = database_user.get_userData_files(options, input_dir)
    pd_samples_retrieved['new_name'] = pd_samples_retrieved['name']

    ## get info: profile, ident, cluster, MGE
    pd_samples_info = database_user.get_userData_info(options, input_dir)

    ## get databases to list
    #retrieve_databases = get_options_db(options)

    ## create output files
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "report", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)
        print(colored("**DEBUG: pd_samples_info **", 'yellow'))
        print(pd_samples_info)

    ## generate output folder, if necessary
    print("\n\n\n+ Generate a report summarizing analysis and sample information")
    if not options.project:
        HCGB_files.create_folder(outdir)
        outdir_report = outdir
    else:
        ### report generation
        outdir_report = HCGB_files.create_subfolder("report", outdir)

    ## create report with all data
    summary_report = HCGB_files.create_subfolder("summary_report", outdir_report)
    print("Folder: ", summary_report)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create species specific report if any
    ########################################
    if (options.species_report):
        ## Saureus
        if options.species_report == "Saureus":
            Saureus_specific(pd_samples_retrieved, pd_samples_info,
                             options, summary_report, outdir_dict)
        ## else
        ## to add accordingly

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########################################################
    ## create gene fasta sequences retrieval if desired
    ###########################################################
    if options.genes_ids_fasta:
        ## given a list of genes ids, retrieve sequence for all samples from profile
        if os.path.isfile(os.path.abspath(options.genes_ids_fasta)):
            in_file = os.path.abspath(options.genes_ids_fasta)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            print('+ Retrieve selected genes sequences from the profile analysis for each sample.')
            print('+ Searching gene:')

            ## get profiles available
            results_geneIDs = pd.DataFrame(columns=('sample', 'gene', 'id', 'sequence'))
            sample_frame = pd_samples_info.groupby(["name"])
            for g in gene_names:
                print("\t+", g)
                for name, cluster_df in sample_frame:
                    my_list_profiles = cluster_df.loc[cluster_df['tag'] == 'profile']['ext'].to_list()
                    if options.debug:
                        print("name: ", name)
                        print("my_list_profiles:")
                        print(my_list_profiles)

                    for p in my_list_profiles:
                        main_profile_folder = cluster_df.loc[cluster_df['ext'] == p]['dirname'].to_list()[0]
                        p = p.lower()
                        if p == 'vfdb':
                            p = p + '_full'

                        profile_folder = os.path.join(main_profile_folder, p)
                        (seq_id, seq_sequence) = retrieve_genes.retrieve_genes_ids_sequences(
                            profile_folder, g, Debug)
                        if (seq_id):
                            ## save results
                            results_geneIDs.loc[len(results_geneIDs)] = (name, g, seq_id, seq_sequence)

            ## save for each gene in a separate fasta file
            list_of_genes = set(results_geneIDs['gene'].to_list())

            ## debug
            if Debug:
                print("** DEBUG **")
                print(results_geneIDs)
                print(list_of_genes)

            ## Save results
            genes_folder = HCGB_files.create_subfolder('genes', summary_report)
            for gene_retrieved in list_of_genes:
                this_frame = results_geneIDs[results_geneIDs['gene'] == gene_retrieved]

                gene_retrieved_file = os.path.join(genes_folder, gene_retrieved)
                gene_retrieved_fasta = gene_retrieved_file + ".fasta"
                gene_retrieved_info = gene_retrieved_file + "_info.txt"
                fasta_hd = open(gene_retrieved_fasta, 'w')
                info_hd = open(gene_retrieved_info, 'w')

                for item, row in this_frame.iterrows():
                    string2write = ">" + row['sample'] + '_' + row['gene'] + '\n' + row['sequence'] + '\n'
                    string2write_info = row['sample'] + '\t' + row['gene'] + '\t' + row['id'] + '\n'
                    fasta_hd.write(string2write)
                    info_hd.write(string2write_info)

                fasta_hd.close()
                info_hd.close()

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create gene promoter fasta sequences retrieval if desired
    ########################################
    if options.promoter_bp:
        ## retrieve as many bp as necessary from genes_ids_fasta
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")
        #get_promoter.get_promoter(file, geneOfInterest, basePairs, sampleName, option, debug=False):

    ########################################
    ## create gene specific report if any
    ########################################
    if options.genes_ids_profile:
        if options.species_report == "Saureus":
            if Debug:
                print("** options.genes_ids_profile **")
                print("Analysis already done for Saureus")
        else:
            in_file = os.path.abspath(options.genes_ids_profile)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            results_Profiles = retrieve_genes.get_genes_profile(pd_samples_info, gene_names,
                                                                options.debug, "name")
            if options.debug:
                print("results_Profiles")
                print(results_Profiles)

            ## open excel writer
            name_excel = summary_report + '/gene_ids_profile.xlsx'
            writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
            results_Profiles.to_excel(writer, sheet_name="gene_ids")

            ## close
            writer.save()

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###############################################
    ## Search for any additional fasta sequence
    ###############################################
    if options.genes_fasta:
        ## given a list of fasta sequences search using blast against proteins annotated or genome
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Report generation module.")
    return()
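## ---------------------------------------------------------------------
## Sketch of the per-gene FASTA dump performed in run_report(): one
## ">sample_gene" record per retrieved sequence. Dataframe columns match
## results_geneIDs above; output paths are hypothetical.
## ---------------------------------------------------------------------
# for gene in set(results_geneIDs['gene']):
#     frame = results_geneIDs[results_geneIDs['gene'] == gene]
#     with open(gene + ".fasta", 'w') as fasta_hd:
#         for _, row in frame.iterrows():
#             fasta_hd.write(">%s_%s\n%s\n" % (row['sample'], row['gene'], row['sequence']))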
def run_profile(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    if (options.help_ARIBA):
        ## help_format option
        ariba_caller.help_ARIBA()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Virulence & Resistance profile module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    global input_dir
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    global Project
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                        ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "profile", options.debug)

    ###
    print("+ Generate a sample profile for virulence and resistance candidate genes for each sample retrieved using:")
    print("(1) Antimicrobial Resistance Inference By Assembly (ARIBA) software")
    print("(2) Pre-defined databases by different suppliers or user-defined databases.")

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ########
    ARIBA_ident(options, pd_samples_retrieved, outdir_dict,
                retrieve_databases, start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:
        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")

        ## TODO: check if it works
        dataBase_user = database_user.update_database_user_data(options.database,
                                                                input_dir, Debug, options)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: results obtained **", 'yellow'))

    else:
        print("+ No update of the database has been requested using option --fast")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Virulence & Resistance profile module.")
    return()
def run(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                        ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm", options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list("available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame}

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: generate symbolic links for each file in " + dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(r".*trim_R\d{1}.*", files)  ## only paired-end. Todo: single end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report", "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic", trimm_report, "")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % trimm_report)

    ## create fastqc for trimmed reads
    pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim",
                                                                ['_trim'], options.debug)
    qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
              start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("\n+ Exiting trimm module.")
    return()
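## ---------------------------------------------------------------------
## The symbolic-link step above keeps only paired-end trimmed reads by
## matching file names against r".*trim_R\d{1}.*". A quick check of the
## pattern (file names are hypothetical):
## ---------------------------------------------------------------------
# import re
# pattern = re.compile(r".*trim_R\d{1}.*")
# assert pattern.search("sampleA_trim_R1.fastq.gz")
# assert pattern.search("sampleA_trim_R2.fastq.gz")
# assert not pattern.search("sampleA_trim_joined.fastq.gz")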
def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "assembly",
                                                        ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot", options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print("\t-Option: genera = Off; No genus-specific BLAST databases option provided")
    else:
        print("\t-Option: genera = ", options.genera, "; Genus-specific BLAST databases option provided")
    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna; Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap; Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'], outdir_dict[row['name']],
                            options, row['name'], threads_job): index
            for index, row in pd_samples_retrieved.iterrows()}

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print("+ Detailed information for each sample can be found in separate folders:")
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation", outdir_report)
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(colored("\tA previous report generated results on: %s" % stamp, 'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka", PROKKA_report, "-dd 2")
            print('\n+ A summary HTML report of each sample is generated in folder: %s' % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options, start_time_partial_BUSCO, "proteins")

    ## print to file: results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return()
def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for join reads
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")

    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                            ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                            ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(outdir, options.project,
                                                         pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## mapReads_module returns the updated timestamp and a dictionary of
    ## per-sample mapping results (see note in mapReads_module).
    (start_time_partial, mapping_results) = mapReads_module(options, pd_samples_retrieved,
                                                            mapping_outdir_dict, options.debug,
                                                            max_workers_int, threads_job,
                                                            start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(outdir, options.project,
                                                         pd_samples_retrieved, "biotype", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module for featureCount analysis.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detailed information for each sample can be found in separate folders:")

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder("featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder("biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder("samples", biotype_report)

        ## results
        dict_files = {}
        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples], 'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(biotype_outdir_dict[samples],
                                                              '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum', options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)

    print("\n+ Exiting biotype module.")
    return()
def mapReads_module(options, pd_samples_retrieved, outdir_dict, Debug,
                    max_workers_int, threads_job, start_time_partial, outdir):

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["new_name"])

    ## options
    STAR_exe = set_config.get_exe("STAR", Debug=Debug)
    cwd_folder = os.path.abspath("./")
    folder = files_functions.create_subfolder('STAR_files', cwd_folder)

    ## For many samples it will have to load genome index in memory every time.
    ## For a unique sample it will not matter. Take care genome might stay in memory.
    ## Use before loop option LoadAndExit and then:
    ##    in loop: use option LoadAndKeep, set shared memory > 30 Gb
    ## when finished loop: Remove memory

    ## check reference
    if (options.fasta):
        print("+ Genome fasta file provided")
        print("+ Create genomeDir for later usage...")
        options.fasta = os.path.abspath(options.fasta)

        ## create genomeDir
        options.genomeDir = mapReads.create_genomeDir(folder, STAR_exe, options.threads,
                                                      options.fasta, options.limitRAM)
    elif (options.genomeDir):
        print("+ genomeDir provided.")
        options.genomeDir = os.path.abspath(options.genomeDir)

    ## remove previous reference genome from memory
    print("+ Remove genome in memory from previous call... (if any)")
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder, options.threads)

    ## load reference genome
    mapReads.load_Genome(folder, STAR_exe, options.genomeDir, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    print("+ Mapping sequencing reads for each sample retrieved...")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(mapReads_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, STAR_exe,
                            options.genomeDir, options.limitRAM, Debug): name
            for name, cluster in sample_frame}

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Mapping reads has finished...")

    ## collect per-sample mapping output locations for the biotype step.
    ## NOTE: assumed layout; the caller (run_biotype) expects a dict of
    ## sample name -> mapping results location.
    mapping_results = {name: outdir_dict[name] for name, cluster in sample_frame}

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## remove reference genome from memory
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detailed information for each sample can be found in separate folders:")

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
            print(my_outdir_list)
            print("\n")

        map_report = files_functions.create_subfolder("STAR", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "STAR", map_report, "-dd 2")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % map_report)

    return (start_time_partial, mapping_results)
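## ---------------------------------------------------------------------
## Sketch of the STAR shared-memory lifecycle wrapped by mapReads_module():
## load the genome once, map every sample against the in-memory copy, then
## release it. Pseudocode only; the real calls are mapReads.load_Genome()
## and mapReads.remove_Genome() above, and the STAR flags come from the
## comments in the function body.
## ---------------------------------------------------------------------
# load_Genome(genomeDir)            # STAR --genomeLoad LoadAndExit
# for sample in samples:
#     map_reads(sample, genomeDir)  # STAR --genomeLoad LoadAndKeep
# remove_Genome(genomeDir)          # STAR --genomeLoad Remove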
def BUSCO_check(input_dir, outdir, options, start_time_total, mode):

    HCGB_aes.boxymcboxface("BUSCO Analysis Quality check")

    ## absolute path for in & out
    database_folder = os.path.abspath(options.database)

    ## get files and set outdir for each sample according to mode
    if mode == 'genome':
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "assembly",
                                                            ["fna"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("assembly_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved, "assemble_qc",
                                                      options.debug)

    elif mode == 'proteins':
        pd_samples_retrieved = sampleParser.files.get_files(options, outdir, "annot",
                                                            ["faa"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("annot_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved, "annot_qc",
                                                      options.debug)

    ## add column to dataframe
    pd_samples_retrieved['busco_folder'] = ""
    for index, row in pd_samples_retrieved.iterrows():
        pd_samples_retrieved.at[index, 'busco_folder'] = BUSCO_outdir_dict[row['name']]

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("df_samples_busco", 'yellow')
        print(pd_samples_retrieved)
        HCGB_aes.debug_message("BUSCO_outdir_dict", 'yellow')
        print(BUSCO_outdir_dict)

    ## Check each sample using BUSCO
    BUSCO_Database = HCGB_files.create_subfolder('BUSCO', database_folder)
    if not os.path.exists(BUSCO_Database):
        HCGB_files.create_folder(BUSCO_Database)

    ## call
    (dataFrame_results, stats_results) = BUSCO_caller.BUSCO_call(options.BUSCO_dbs,
                                                                 pd_samples_retrieved,
                                                                 BUSCO_Database,
                                                                 options.threads, mode)

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("dataFrame_results", 'yellow')
        HCGB_main.print_all_pandaDF(dataFrame_results)

    ## time stamp
    print("+ Quality control of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## multiqc report plot
    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a BUSCO report plot.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        print("+ Detailed information for each sample can be found in separate folders.")

        ## name folder according to mode
        if mode == 'genome':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_assembly", outdir_report)
        elif mode == 'proteins':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_annot", outdir_report)

        ## generate plots
        print("+ Generate summarizing plots...")
        BUSCO_caller.BUSCO_plots(dataFrame_results, BUSCO_report, options.threads)
        print('\n+ Check quality plots in folder: %s' % BUSCO_report)

        ## TODO: Parse BUSCO statistics in dataframe (stats_results) and, given a
        ## cutoff, discard samples or advise the user to discard them if necessary.

        ### print statistics
        stats_results.to_csv(BUSCO_report + "/BUSCO_stats.csv")
        name_excel = BUSCO_report + "/BUSCO_stats.xlsx"
        writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
        stats_results.to_excel(writer, sheet_name="BUSCO statistics")
        writer.save()
        print('\n+ Check quality statistics in folder: %s' % BUSCO_report)

    return (dataFrame_results)
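## ---------------------------------------------------------------------
## Editor's illustrative sketch for the TODO above (not part of the
## original module): flag samples whose BUSCO completeness falls below a
## cutoff. The 'Complete' column name and the 90% default are
## assumptions made only for this example.
## ---------------------------------------------------------------------
def flag_low_quality_samples_sketch(stats_results, cutoff=90.0):
    """Return sample names whose completeness percentage is below cutoff."""
    low = stats_results[stats_results['Complete'] < cutoff]
    for sample in low.index:
        print("** WARNING: sample %s is below %.1f%% BUSCO completeness" % (sample, cutoff))
    return list(low.index)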
def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using
    BUSCO_ software and database.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`

        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`

        - :func:`BacterialTyper.modules.qc.BUSCO_check`

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc
    """

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                        ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved,
                                            "assemble", options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads), "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int), "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Use a with statement to ensure threads are cleaned up promptly
    print('+ Running SPADES assembly for each sample...')
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## send for each sample
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), threads_job): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ###################
    if Debug:
        HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
        print(assembly_stats)

    ## create single file
    get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial, "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting Assembly module.")
    return ()
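## ---------------------------------------------------------------------
## Editor's illustrative sketch (not part of the original module): the
## threads-per-job split used above. HCGB_main.optimize_threads()
## encapsulates similar logic; this version is an assumption written out
## only to make the arithmetic explicit.
## ---------------------------------------------------------------------
def split_threads_sketch(total_threads, n_samples):
    """Give each concurrent job an equal share of the available CPUs."""
    threads_job = max(1, int(total_threads / n_samples))
    max_workers = max(1, int(total_threads / threads_job))
    return threads_job, max_workers

## e.g. 8 CPUs and 3 samples -> 2 threads per job, 4 concurrent workers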
def run_report(arg_dict):
    """Main function of the plot and report generation module in BacDup package.

    This module generates plots and summary reports from a previously created
    gene duplication analysis. It allows the user to provide a previously
    parsed data project (NCBI Genbank IDs, taxonomy or user annotation data)
    or single or multiple samples.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB.functions.aesthetics_functions.boxymcboxface("Report generation module")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## default
    arg_dict.project = True
    arg_dict.batch = False

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: Not available', 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB.functions.aesthetics_functions.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    time.sleep(1)
    print()

    ## parse options: duplicate annotation files
    pd_samples_dups = sampleParser.files.get_files(arg_dict, outdir, "dups",
                                                   ["dup_annot.csv"], arg_dict.debug)
    pd_samples_dups = pd_samples_dups.drop(["dirname", "name", "ext", "tag"], axis=1)
    pd_samples_dups = pd_samples_dups.rename(index=str, columns={'sample': 'file_data'})
    pd_samples_dups['format'] = 'dup_annot'

    ## debug messages
    if (arg_dict.debug):
        debug_message("pd_samples_dups", 'yellow')
        HCGB.functions.main_functions.print_all_pandaDF(pd_samples_dups)

    ## parse options: length tables
    pd_info = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                           ["length_df.csv"], arg_dict.debug)
    pd_info = pd_info.drop(["dirname", "name", "ext", "tag"], axis=1)
    pd_info = pd_info.rename(index=str, columns={'sample': 'length_table'})

    ## debug messages
    if (arg_dict.debug):
        debug_message("pd_info", 'yellow')
        HCGB.functions.main_functions.print_all_pandaDF(pd_info)

    ## merge into pd_samples_retrieved
    pd_samples_retrieved = pd.merge(pd_samples_dups, pd_info)

    ## debugging messages
    if arg_dict.debug:
        debug_message("pd_samples_retrieved", 'yellow')
        HCGB.functions.main_functions.print_all_pandaDF(pd_samples_retrieved)

    ## time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## create output folder for each sample
    dict_plot_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dup_plot", arg_dict.debug)

    ## debugging messages
    if arg_dict.debug:
        debug_message("dict_plot_folders", 'yellow')
        print(dict_plot_folders)

    ## create results
    for sample, folder in dict_plot_folders.items():
        plot_timestamp = os.path.join(dict_plot_folders[sample], '.plot_success')
        dup_annot_file = pd_samples_retrieved[
            pd_samples_retrieved['new_name'] == sample][['file_data']].values[0][0]
        length_info_file = pd_samples_retrieved[
            pd_samples_retrieved['new_name'] == sample][['length_table']].values[0][0]
        print(dup_annot_file)
        print(length_info_file)
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases, start_time_partial):

    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status ##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print(colored("\t+ Database %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use **", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(colored("**DEBUG: card_trick_info: " + card_trick_info + " **", 'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db', 'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name], db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################

    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop over databases, sending one job per sample within each database
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))

            ## send for each sample
            commandsSent = {
                executor.submit(ariba_run_caller,
                                db2use[0], db2use[1],  ## database path & dbname
                                sorted(cluster["sample"].tolist()),  ## files
                                outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                                threads_job, options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print("+ Collecting information for each sample analyzed for database: " + db2use[1])

            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(db2use[1], outdir_samples,
                                                                options.ARIBA_cutoff, card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results for all samples
    print("\n + Generate a summary file for all samples and one for each database employed...")

    ## parse results
    ## NOTE: the original code tested an undefined name (Project) here; using
    ## options.project and the module-level input_dir is assumed instead.
    if options.project:
        final_dir = os.path.join(input_dir, 'report', 'profile')
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir)
    ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Different databases provided (other than VFDB and CARD) would collapse this file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary, report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')

            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:
            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()

        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir + '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ", final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print("+ For each database upload files *phandango.csv and *phandango.tre and visualize results")
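## ---------------------------------------------------------------------
## Editor's illustrative sketch (not part of the original module): the
## underlying `ariba summary` call that ariba_caller.ariba_summary_all()
## wraps. It produces <prefix>.csv plus the phandango files
## (<prefix>.phandango.csv, <prefix>.phandango.tre) mentioned above.
## ---------------------------------------------------------------------
import subprocess

def ariba_summary_demo(out_prefix, report_tsv_files):
    ## ariba summary <out_prefix> report1.tsv report2.tsv ...
    cmd = ["ariba", "summary", out_prefix] + list(report_tsv_files)
    subprocess.run(cmd, check=True)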
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()

    ######################################################
    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)

        if (options.ID_file):
            ## get path and check if it is a file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded will be included in a kma index

        ## Get all entries belonging to the taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check descendant NCBI taxonomy ids provided ---------\n")
            HCGB_aes.print_sepLine("*", 70, False)

            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA', genbank_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()

        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(colored("DEBUG: User provides folder containing project", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)

            dataBase_user = database_user.update_database_user_data(options.path,
                                                                    abs_project_folder,
                                                                    Debug, options)
        else:
            print(colored("ERROR: Folder provided does not exist: %s" % options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA', user_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###########
    ## ARIBA ##
    ###########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases will be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        elif (options.ariba_dbs):
            ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
            ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### user provided ariba databases
        if (options.ariba_users_fasta):
            print("+ Generate ARIBA database for databases provided: prepare fasta and metadata information")

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]: ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from the KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print("+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website")

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets will be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting Database module.\n")
    return ()
def run_cluster(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                                ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                                options.debug)
        else:
            ## trimmed reads
            pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                                ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]

    else:
        ## default: assemblies
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "assembly",
                                                            ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    # exit if empty
    if pd_samples_retrieved.empty:
        print("No data has been retrieved from the project folder provided. Exiting now...")
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved,
                                            "mash", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[
            ~retrieve_databases.index.isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_databases **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            ## a stored sketch is re-used only if kmer size and sketch number match
            if all([int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])]):
                siglist_all.append(min_hash_caller.read_signature(row['path'], options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'], index,
                                             options.kmer_size, options.n_sketch, Debug)
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)

    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(colored('\t+ Sketched signature (%s) available within user data...' % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## A file signature might already exist: read its original settings
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)
                if all([int(options.kmer_size) == int(original[1]),
                        int(options.n_sketch) == int(original[2])]):
                    siglist_all.append(min_hash_caller.read_signature(this_sig, options.kmer_size))
                    pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                        'project_data', index, this_sig, row['sample'],
                        options.kmer_size, options.n_sketch)
                    print(colored('\t+ Sketched signature available (%s) in project folder...' % index, 'green'))
                    continue

        print(colored('\t+ Sketched signature to be generated: (%s)...' % index, 'yellow'))

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'], index,
                                             options.kmer_size, options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = (
            'project_data', index, sigfile, row['sample'],
            options.kmer_size, options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[['source', 'db', 'path', 'original', 'ksize', 'num_sketch']]
        tmp = tmp.rename(columns={'db': 'name'})
        tmp = tmp.set_index('name')

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp], join='inner', sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)
        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)
        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)
        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign a color label per data source
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data), "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data), "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all, tag_cluster_info, Debug)

    ## plot images
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext, tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext, tag_cluster_info)

    return ()
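## ---------------------------------------------------------------------
## Editor's illustrative sketch (not part of the original module):
## turning a pairwise similarity matrix into a Newick tree, conceptually
## what min_hash_caller.compare()/get_Newick_tree() do with the mash
## signatures above. The scipy calls are standard; the conversion helper
## and the distance = 1 - similarity assumption are ours.
## ---------------------------------------------------------------------
import numpy as np
from scipy.cluster import hierarchy

def newick_from_matrix_sketch(DataMatrix, labeltext):
    ## condensed upper-triangle distances, as expected by linkage()
    dist = 1.0 - np.asarray(DataMatrix)
    condensed = dist[np.triu_indices_from(dist, k=1)]
    tree = hierarchy.to_tree(hierarchy.linkage(condensed, method='average'))

    def to_newick(node):
        if node.is_leaf():
            return str(labeltext[node.id])
        return "(%s,%s):%.4f" % (to_newick(node.left),
                                 to_newick(node.right), node.dist)

    return to_newick(tree) + ";"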
def run_prep(options):
    """Main function of the prep module.

    This module prepares fastq files for later usage. It initially checks the
    length of the sample names and advises the user to rename samples if the
    limit is exceeded. Along ``BacterialTyper`` there are a few string length
    limitations imposed by different software that need to be sorted out from
    the beginning of the process.

    This module allows the user to copy files into the project folder or just
    create symbolic links, to avoid duplicating raw data.

    See additional details of this module in user_guide :ref:`prep module entry<prep-description>`.

    .. seealso:: This function depends on other HCGB functions called:

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    """

    ## help_format option
    if (options.help_format):
        help_info.help_fastq_format()
        exit()

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Preparing samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(options.output_folder)

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
    else:
        options.project = True

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ### info
    final_dir = ""
    if (options.project):
        print("+ Generate a directory containing information within the project folder provided")
        final_dir = HCGB_files.create_subfolder("info", outdir)
    else:
        final_dir = outdir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                        ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                        options.debug)

    ## Information returned in pd_samples_retrieved:
    ### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz
    if options.debug:
        HCGB_aes.debug_message("pd_samples_retrieved", "yellow")
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## check character limitation
    list_lengths = pd_samples_retrieved.loc[:, 'name_len'].to_list()
    if any(i > 10 for i in list_lengths):
        print(colored("\t ** Some sample names exceed the 10-character limitation...", 'yellow'))
        if not (options.rename):
            print(colored("** ERROR: Rename files or provide --rename option...", 'red'))
            exit()

    ### rename files
    if (options.rename):
        options.rename = os.path.abspath(options.rename)
        if not HCGB_files.is_non_zero_file(options.rename):
            print(colored("** ERROR: File provided with rename information is not readable.", 'red'))
            print(options.rename)
            exit()

        ## read csv to dictionary
        names_retrieved = pd.read_csv(options.rename, sep=',',
                                      index_col=0, squeeze=True,
                                      header=None).to_dict()

        if (options.debug):
            HCGB_aes.debug_message("names_retrieved", "yellow")
            print(names_retrieved)

        ## TODO: check integrity of new names and special characters

        ## print to a file
        timestamp = HCGB_time.create_human_timestamp()
        rename_details = final_dir + '/' + timestamp + '_prep_renameDetails.txt'
        rename_details_hd = open(rename_details, 'w')

        ## rename files
        for index, row in pd_samples_retrieved.iterrows():
            if (row['gz']):
                extension_string = row['ext'] + row['gz']
            else:
                extension_string = row['ext']

            ## single-end reads get no read-pair tag; works for both modes
            if options.single_end:
                renamed = names_retrieved[row['name']] + '.' + extension_string
            else:
                renamed = names_retrieved[row['name']] + '_' + row['read_pair'] + '.' + extension_string

            ## modify frame
            pd_samples_retrieved.loc[index, 'new_file'] = renamed
            pd_samples_retrieved.loc[index, 'new_name'] = names_retrieved[row['name']]

            ## save in file
            string = row['sample'] + '\t' + renamed + '\n'
            rename_details_hd.write(string)

            if (options.debug):
                print(colored('** DEBUG: rename', 'yellow'))
                print("Original: ", row['name'])
                print("Renamed: ", names_retrieved[row['name']])
                print("File:", renamed)

        rename_details_hd.close()
        print("+ Sample files have been renamed...")
    else:
        pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

    ## create outdir for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved,
                                            "raw", options.debug)

    ## merge option
    if (options.merge):
        print("+ Sample files will be merged...")

        ## TODO: check when rename option provided
        pd_samples_merged = sampleParser.merge.one_file_per_sample(
            pd_samples_retrieved, outdir_dict, options.threads, final_dir, options.debug)

        if (options.rename):
            print("+ Merged files have been renamed...")
        else:
            print("+ Sample files have been merged...")

        ## process is finished here
        print("\n*************** Finish *******************")
        start_time_partial = HCGB_time.timestamp(start_time_total)
        print("+ Exiting prep module.")
        exit()

    ## debugging messages
    if (options.debug):
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)
        print(colored("** DEBUG: outdir_dict", 'yellow'))
        print(outdir_dict)

    ## copy or create symbolic link for files
    if (options.copy):
        print("+ Sample files will be copied...")

        ## print to a file
        timestamp = HCGB_time.create_human_timestamp()
        copy_details = final_dir + '/' + timestamp + '_prep_copyDetails.txt'
        copy_details_hd = open(copy_details, 'w')
    else:
        print("+ Sample files will be linked...")

    list_reads = []
    for index, row in pd_samples_retrieved.iterrows():
        if (options.copy):
            ## TODO: debug & set threads to copy faster
            shutil.copy(row['sample'], os.path.join(outdir_dict[row['new_name']], row['new_file']))
            string = row['sample'] + '\t' + os.path.join(outdir_dict[row['new_name']],
                                                         row['new_file']) + '\n'
            copy_details_hd.write(string)
        else:
            list_reads.append(row['new_file'])
            if options.project:
                HCGB_files.get_symbolic_link_file(row['sample'],
                                                  os.path.join(outdir_dict[row['new_name']],
                                                               row['new_file']))

    if (options.copy):
        print("+ Sample files have been copied...")
        copy_details_hd.close()
    else:
        if not options.project:
            HCGB_files.get_symbolic_link(list_reads, outdir)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting prep module.")
    return ()
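## ---------------------------------------------------------------------
## Editor's illustrative sketch (not part of the original module): the
## layout expected for the --rename CSV read above (old name, new name,
## no header) and an equivalent of the pd.read_csv(...).to_dict() call
## without the deprecated squeeze argument. File contents shown here are
## made-up examples.
## ---------------------------------------------------------------------
## sample_rename.csv:
##   WTP_L23_ACTGTT_S10,WTP1
##   KO_sample_long_name_L23,KO1
import pandas as pd

def read_rename_csv_sketch(path):
    ## single remaining column indexed by the original name -> {old: new}
    return pd.read_csv(path, sep=',', index_col=0, header=None)[1].to_dict()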