## Shared imports assumed by the functions below (inferred from usage). The
## HCGB_* aliases, sampleParser and the BacterialTyper/XICRA script helpers
## are imported at the top of their respective modules and not re-listed here.
import os
import re
import time
import shutil
import concurrent.futures
import pandas as pd
from termcolor import colored


def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using
    BUSCO_ software and database.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`

        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`

        - :func:`BacterialTyper.modules.qc.BUSCO_check`

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc
    """

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "assemble",
                                            options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each sample
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()),
                            threads_job): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ###################
    ## assembly_stats is a module-level dictionary populated by
    ## check_sample_assembly for each sample processed
    ###################
    if Debug:
        HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
        print(assembly_stats)

    ## create single file
    get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial,
                                 "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
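
## Illustrative sketch (not called by the pipeline): the threads_job /
## max_workers arithmetic used above. HCGB_main.optimize_threads is assumed
## here to return a per-job thread count derived from the total thread count
## and the number of samples; the worker count is the integer ratio of the two.
def _example_thread_split(total_threads=8, n_samples=3):
    threads_job = max(1, total_threads // n_samples)  ## assumed behaviour
    max_workers = int(total_threads / threads_job)
    return threads_job, max_workers

## e.g. _example_thread_split(8, 3) -> (2, 4): four parallel jobs with two
## threads each, so the pool never oversubscribes the 8 available threads.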

def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial):
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status ##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(
                        options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(colored("**DEBUG: card_trick_info: " + card_trick_info +
                          " **", 'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db',
                                           'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name,
                                                       outdir_dict[name],
                                                       db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    ## loop
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0], db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job, options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print("+ Collecting information for each sample analyzed for "
                  "database: " + db2use[1])

            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print("\n + Generate a summary file for all samples and one for each "
          "database employed...")

    ## parse results
    ## NOTE: Project and input_dir are module-level globals set by the
    ## calling ident module
    if Project:
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir)
    ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Different databases provided (different to VFDB and CARD)
            ## would collapse file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each
    ## database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:
            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                ## rename in place, then select the renamed column (the
                ## original assigned the None returned by inplace=True)
                df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)

        merge_df.fillna("NaN", inplace=True)
        trans_df = merge_df.transpose()

        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close (note: newer pandas versions replace ExcelWriter.save with close)
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print("+ For each database upload files *phandango.csv and "
          "*phandango.tre and visualize results")
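
## Illustrative sketch (not called by the pipeline): the (sample, db)
## multi-index lookup that ARIBA_ident performs on outdir_samples above.
## Column names mirror that frame; the paths are placeholders.
def _example_multiindex_lookup():
    import pandas as pd
    df = pd.DataFrame({'sample': ['s1', 's1'],
                       'db': ['card', 'vfdb_full'],
                       'dirname': ['out/s1', 'out/s1'],
                       'output': ['out/s1/card', 'out/s1/vfdb_full']})
    df = df.set_index(['sample', 'db'])
    ## one label per index level selects a single cell
    return df.loc[('s1', 'card'), 'output']  ## -> 'out/s1/card'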

def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
              time_partial):
    """Kmer identification using software KMA_.

    :param options: options passed to the
        :func:`BacterialTyper.modules.ident.run_ident` main function
        (threads, KMA_cutoff, etc). See details in...
    :param pd_samples_retrieved: pandas dataframe for samples to process.
    :param outdir_dict: dictionary containing information for each sample of
        the output folder for this process.
    :param retrieve_databases: pandas dataframe with the databases to check
        and use.
    :param time_partial: timestamp of start time of the process.

    :type options:
    :type pd_samples_retrieved: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type retrieve_databases: pandas.DataFrame()
    :type time_partial:

    :return: Information of the identification. See example below.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file
    :file:`/devel/results/KMA_ident_example.csv` here:

    .. include:: ../../devel/results/KMA_ident_example.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.modules.ident.send_kma_job`

        - :func:`BacterialTyper.modules.ident.get_outfile`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`

    .. include:: ../../links.inc
    """
    ## NOTE: this early return short-circuits the function: KMA
    ## identification is currently disabled and the code below is unreachable
    return (pd.DataFrame())

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])
            index_status = species_identification_KMA.check_db_indexed(
                db2use['path'], fold_name)
            if (index_status == True):
                print(colored(
                    "\t+ Databases %s seems to be fine...\n\n" % db2use['db'],
                    'green'))
                databases2use.append(db2use['path'])
            else:
                #databases2use.remove(db2use)
                print(colored(
                    "\t**Databases %s is not correctly indexed. "
                    "Not using it...\n" % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(colored(
            "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
            'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(kma_bin,
                                                                  db2use)
            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(kma_bin,
                                                                  db2use)
            if (return_code_rm == 'FAIL'):
                ## the original message referenced an undefined cmd_rm_db
                ## variable; report the database instead
                print(colored(
                    "***ERROR: Removing database %s from memory failed. "
                    "Please do it manually!" % db2use, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")
    results_summary = pd.DataFrame()
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or
        ### user data or genbank data provided
        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:
            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder("kma",
                                                          outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Default: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            if (results.index.size > 1):
                if (basename_db == "plasmids.T" or basename_db == "viral.TG"):
                    ## let it be several entries
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)
                else:
                    print(colored(
                        "###########################################",
                        'yellow'))
                    print(colored(
                        "Sample %s contains multiple strains." % name,
                        'yellow'))
                    print(colored(
                        "###########################################",
                        'yellow'))
                    print(colored(results, 'yellow'))
                    print('\n\n')

                    ## add both strains if detected
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)

                    ## TODO: add multi-isolate flag

            elif (results.index.size == 1):  ## 1 clear reference
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

            else:
                print(colored(
                    '\tNo clear strain from database %s has been assigned '
                    'to sample %s' % (basename_db, name), 'yellow'))
                ## add empty line if no available
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        ## to_csv returns the CSV as a string when no path is given; print it
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)
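
## Illustrative sketch (not called by the pipeline): accumulating per-sample
## result rows with pd.concat, the pattern used in KMA_ident above, since
## DataFrame.append was removed in pandas >= 2.0. Column names are
## placeholders.
def _example_accumulate_results():
    import pandas as pd
    summary = pd.DataFrame()
    for name in ('s1', 's2'):
        chunk = pd.DataFrame({'Template': ['refA'], 'Sample': [name]})
        summary = pd.concat([summary, chunk], ignore_index=True)
    return summary  ## two rows, one per sample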

def map_samples(options, reference_gbk_file, input_dir, outdir):
    """Maps the retrieved samples (project and/or user database entries)
    against the given reference using snippy and returns the dictionary of
    per-sample output folders."""

    ## set it as variable
    contig_option = False

    pd_samples_retrieved_merge = pd.DataFrame()
    pd_samples_retrieved = pd.DataFrame()

    ## all_data // only_project_data
    if (options.all_data or options.only_project_data):
        ## get files to map
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## discard the sample used as reference if any
        if options.project_sample_ID:
            pd_samples_retrieved = pd_samples_retrieved.drop(
                index=options.project_sample_ID)

        ## create output directories
        outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                pd_samples_retrieved, "phylo",
                                                options.debug)

    ####################################
    ## user_data // genbank_data // only_external_data
    if (options.all_data or options.user_data):

        ## --------------------------- ##
        ### get user database
        ## --------------------------- ##
        print("+ Retrieve samples to map from user database...")
        db_frame_user_Data = database_user.get_userData_files(
            options, os.path.join(options.database, 'user_data'))

        ## discard the sample used as reference if any
        if options.user_sample_ID:
            #db_frame_user_Data = pd_samples_retrieved.drop(index=options.user_sample_ID)  ## why not this?
            db_frame_user_Data = db_frame_user_Data.drop(
                index=options.user_sample_ID)

        ## create output directories in database entries in user_data
        outdir_dict2 = HCGB_files.outdir_subproject(
            os.path.join(options.database, 'user_data'), db_frame_user_Data,
            "phylo")

        ## If user desires to map contigs, map trimmed as default
        if (contig_option):
            # filter for assemblies retrieved
            db_frame_user_Data = db_frame_user_Data.loc[
                db_frame_user_Data['tag'] == "assembly", ]
        else:
            # filter for raw reads
            db_frame_user_Data = db_frame_user_Data.loc[
                db_frame_user_Data['tag'] == "reads", ]

        ## merge if both contain data
        if not pd_samples_retrieved.empty:
            pd_samples_retrieved_merge = pd.concat(
                [db_frame_user_Data, pd_samples_retrieved], sort=True,
                ignore_index=True).drop_duplicates()
            outdir_dict.update(outdir_dict2)
        else:
            outdir_dict = outdir_dict2
            pd_samples_retrieved_merge = db_frame_user_Data

    ## --------------------------- ##
    ### get genbank database
    ## --------------------------- ##
    ## set contig option for these data only

    ## check data
    try:
        if db_frame_user_Data.empty:
            print()
    except NameError:
        ## no user data retrieved: keep project samples only
        pd_samples_retrieved_merge = pd_samples_retrieved

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved_merge **", 'yellow'))
        print(pd_samples_retrieved_merge)
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## TODO: use contig option
    if (contig_option or options.genbank_data):
        print()

    ####################################
    ## for fastq samples
    ####################################

    # optimize threads
    name_list = set(pd_samples_retrieved_merge["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    ## call snippy
    print("\n+ Create mapping of fastq reads for project samples:")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved_merge.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(snippy_variant_caller, reference_gbk_file,
                            sorted(cluster["sample"].tolist()), threads_job,
                            outdir_dict[name], options.name, contig_option,
                            options.other_options, name, options.debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## subfolder within phylo for this mapping
    new_outdir_dict = {}
    for key, value in outdir_dict.items():
        tag = os.path.join(value, key + '_vs_' + options.name)
        new_outdir_dict[key] = tag

    return (new_outdir_dict)
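
## Illustrative sketch (not called by the pipeline): the per-sample
## '<sample>_vs_<reference>' subfolder naming that map_samples returns above.
def _example_vs_outdir(outdir_dict, ref_name):
    import os
    return {key: os.path.join(value, key + '_vs_' + ref_name)
            for key, value in outdir_dict.items()}

## e.g. _example_vs_outdir({'s1': '/data/phylo/s1'}, 'refA')
## -> {'s1': '/data/phylo/s1/s1_vs_refA'}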

def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()  ## exit added: the original fell through to the next check
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print("\t-Option: genera = Off; No genus-specific BLAST databases "
              "option provided")
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")
    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna; Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap; Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print("+ Detail information for each sample could be identified in "
          "separate folders:")
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print('\n+ A summary HTML report of each sample is generated in '
              'folder: %s' % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(colored(
                    "\tA previous report generated results on: %s" % stamp,
                    'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print('\n+ A summary HTML report of each sample is generated in '
                  'folder: %s' % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
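
## Illustrative sketch (not called by the pipeline): the submit-one-job-per-
## dataframe-row pattern used for the prokka calls above; 'worker' stands in
## for any per-sample callable such as annot_caller.
def _example_submit_per_row(df, worker, n_workers=2):
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=n_workers) as executor:
        futures = {executor.submit(worker, row['sample']): index
                   for index, row in df.iterrows()}
        for fut in concurrent.futures.as_completed(futures):
            try:
                fut.result()
            except Exception as exc:
                ## same reporting style as the modules above
                print('%r generated an exception: %s' % (futures[fut], exc))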

def run(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list(
            "available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(colored(
                "**DEBUG: generate symbolic links for each file in " +
                dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(
                        r".*trim_R\d{1}.*",
                        files)  ## only paired-end. Todo: single end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report",
                                   "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic",
                                           trimm_report, "")
        print('\n+ A summary HTML report of each sample is generated in '
              'folder: %s' % trimm_report)

    ## create fastqc for trimmed reads
    pd_samples_retrieved_trimmed = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)
    qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
              start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("\n+ Exiting trimm module.")
    return ()
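
## Illustrative sketch (not called by the pipeline): the regex used above to
## pick trimmed paired-end read files when building symbolic links.
def _example_match_trimmed(filenames):
    import re
    return [f for f in filenames if re.search(r".*trim_R\d{1}.*", f)]

## e.g. _example_match_trimmed(['s1_trim_R1.fastq.gz', 's1.log'])
## -> ['s1_trim_R1.fastq.gz']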

def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        exit()  ## exit added: the original fell through to the next check
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for join reads
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## NOTE: the original assigned only the timestamp but later used
    ## mapping_results; mapReads_module is assumed to return both
    (start_time_partial, mapping_results) = mapReads_module(
        options, pd_samples_retrieved, mapping_outdir_dict, options.debug,
        max_workers_int, threads_job, start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module for featureCount "
              "analysis.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detail information for each sample could be identified in "
              "separate folders:")

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **",
                          'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print('\n+ A summary HTML report of each sample is generated in '
              'folder: %s' % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder("biotype",
                                                          outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}
        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            ## guard against samples with no pdf plot generated
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)

    print("\n+ Exiting biotype module.")
    return ()
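
## Illustrative sketch (not called by the pipeline): collecting each sample's
## featureCount table while skipping empty files, as run_biotype does above;
## os.path.getsize stands in for files_functions.is_non_zero_file here.
def _example_collect_counts(biotype_outdir_dict):
    import os
    dict_files = {}
    for sample, folder in biotype_outdir_dict.items():
        path = os.path.join(folder, 'featureCount.out.tsv')
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            dict_files[sample] = path
    return dict_files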

def fastqc(pd_samples_retrieved, outdir, options, start_time_total,
           name_analysis, Debug):

    HCGB_aes.boxymcboxface("FASTQC Quality check for samples")

    ## debug message
    if (Debug):
        print(colored("\n**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)
        print("\n")

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")

    ## if not project, outdir contains the dir to put output
    ## in this case, in some other cases might not occur
    if not options.project:
        ## fixed: originally called an undefined 'functions' module
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved,
                                            "fastqc_" + name_analysis,
                                            options.debug)

    print("+ Checking quality for each sample retrieved...")
    start_time_partial = start_time_total

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("threads_job: " + str(threads_job), "yellow")

    ## send for each sample
    print("+ Calling fastqc for samples...")
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=int(max_workers_int)) as executor:
        commandsSent = {
            executor.submit(fastqc_caller.run_module_fastqc,
                            outdir_dict[name],
                            sorted(cluster["sample"].tolist()), name,
                            threads_job): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("+ FASTQC for samples has finished...")

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detail information for each sample could be identified in "
              "separate folders:")

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **",
                          'yellow'))
            print(my_outdir_list)
            print("\n")

        fastqc_report = HCGB_files.create_subfolder("FASTQC", outdir_report)
        fastqc_final_report = HCGB_files.create_subfolder(name_analysis,
                                                          fastqc_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC",
                                           fastqc_final_report, "")
        print('\n+ A summary HTML report of each sample is generated in '
              'folder: %s' % fastqc_final_report)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting qc module.")
    ## return instead of the original exit() so callers (e.g. the trimm
    ## module above) can continue after this helper
    return ()
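
## Illustrative sketch (not called by the pipeline): how the groupby used in
## fastqc above turns the samples dataframe into one job per sample name with
## its sorted list of read files. Filenames are placeholders.
def _example_group_jobs():
    import pandas as pd
    df = pd.DataFrame({'name': ['s1', 's1', 's2'],
                       'sample': ['s1_R2.fq', 's1_R1.fq', 's2_R1.fq']})
    return {name: sorted(cluster['sample'].tolist())
            for name, cluster in df.groupby('name')}

## -> {'s1': ['s1_R1.fq', 's1_R2.fq'], 's2': ['s2_R1.fq']}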

def update_database_user_data(database_folder, project_folder, Debug, options):
    """
    Updates user_data folder within the database folder provided.

    It generates a single subfolder for each sample previously analyzed and
    stores the main information and result files for later interpretation,
    comparison and/or summarization with new samples analyzed.

    :param database_folder:
    :param project_folder:
    :param Debug:
    :param options:

    :type database_folder:
    :type project_folder:
    :type Debug:
    :type options:

    :returns: Updated database result from
        :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
    :rtype: Dataframe

    :warnings: Returns **FAIL** if check process failed.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_subfolder`

        - :func:`HCGB.functions.main_functions.functions.get_data`

        - :func:`HCGB.functions.main_functions.optimize_threads`

        - :func:`BacterialTyper.scripts.database_user.get_userData_files`

        - :func:`BacterialTyper.scripts.database_user.update_sample`

        - :func:`BacterialTyper.scripts.database_generator.getdbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
    """

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data
    project_all_data = pd.concat([project_data_df, project_info_df],
                                 join='outer', sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages:
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)

        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)

        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder,
                                         'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) +
                      " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                      'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                      'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' %
          (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data,
                            user_data_db, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    for name, cluster in sample_frame:
        ###### dump to file
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            user_data_db = pd.concat([user_data_db, dataGot], join='outer',
                                     sort=True).drop_duplicates()
            ## concatenating by outer we get all available entries

    if (options.debug):
        print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
        print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'

    dataUpdated = database_generator.update_db_data_file(user_data_db,
                                                         database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)
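
## Illustrative sketch (not called by the pipeline): the outer-join merge of
## per-sample info frames into the user database, as in the populate loop of
## update_database_user_data above.
def _example_merge_info(user_db, per_sample_frames):
    import pandas as pd
    for frame in per_sample_frames:
        user_db = pd.concat([user_db, frame], join='outer',
                            sort=True).drop_duplicates()
    return user_db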