def get_outfile(output_dir, name, index_name):
    """Build the output file path for a sample analysed against a database.

    The database tag is the basename of whatever precedes the first
    "_prepareref/" in ``index_name``.

    :param output_dir: Folder where results are stored.
    :param name: Sample name.
    :param index_name: Path of the prepared database.
    :type output_dir: string
    :type name: string
    :type index_name: string
    :returns: ``output_dir/<tag>`` when the module-level ``Project`` flag is
        set; otherwise ``<sample subfolder>/<name>_<tag>``, where the
        subfolder is created via ``HCGB_files.create_subfolder``.
    """
    db_path = index_name.split("_prepareref/")[0]
    db_tag = os.path.basename(db_path)

    if Project:
        result = output_dir + '/' + db_tag
    else:
        sample_dir = HCGB_files.create_subfolder(name, output_dir)
        result = sample_dir + '/' + name + '_' + db_tag

    ## developer messages, only when the module-level Debug flag is on
    if Debug:
        inputs_msg = "**DEBUG: Input names " + name + '\n' + output_dir + '\n' + index_name + "\n"
        print(colored(inputs_msg, 'yellow'))
        outputs_msg = "**DEBUG: Output names \n" + db_tag + '\n' + db_path + '\n' + result + " **\n"
        print(colored(outputs_msg, 'yellow'))

    return result
def _copy_busco_summary(row, destination_folder):
    """Copy one BUSCO short summary into ``destination_folder``.

    The copy is named ``short_summary.specific.<dataset>.<sample>.txt`` using
    the row's own 'busco_dataset' and 'name' values.
    """
    target = (destination_folder + '/short_summary.specific.' +
              row['busco_dataset'] + '.' + row['name'] + '.txt')
    shutil.copy(row['busco_summary'], target)


def BUSCO_plots(dataFrame_results, outdir, threads):
    """Collect BUSCO summaries per dataset and per sample and plot each group.

    A ``BUSCO_plots`` subfolder is created in ``outdir``. Inside it, one
    folder per dataset and one per sample receive copies of the matching
    'busco_summary' files, and ``BUSCO_plot`` is then called on every folder.

    :param dataFrame_results: DataFrame with (at least) the columns
        'name', 'busco_dataset' and 'busco_summary'.
    :param outdir: Folder where the ``BUSCO_plots`` subfolder is created.
    :param threads: Unused here; kept for interface compatibility.
    :returns: An empty tuple.
    """
    ## DataFrame columns ('sample', 'dirname', 'name', 'ext', 'tag',
    ## 'busco_folder', 'busco_dataset', 'busco_summary', 'busco_results')
    list_datasets = set(dataFrame_results['busco_dataset'].tolist())
    list_samples = set(dataFrame_results['name'].tolist())

    plot_folder = HCGB_files.create_subfolder('BUSCO_plots', outdir)
    outdir_busco_plot = []

    ## summary for dataset
    print ("+ Get results for all samples summarized by dataset:")
    for dataset in list_datasets:
        print ("\t+ Get results for: ", dataset)
        plot_folder_dataset = HCGB_files.create_subfolder(dataset, plot_folder)
        outdir_busco_plot.append(plot_folder_dataset)
        matching = dataFrame_results[dataFrame_results['busco_dataset'] == dataset]
        for index, row in matching.iterrows():
            _copy_busco_summary(row, plot_folder_dataset)

    ## summary for sample
    print ("+ Get results for summarized by sample:")
    for sample in list_samples:
        print ("\t+ Get results for: ", sample)
        plot_folder_sample = HCGB_files.create_subfolder(sample, plot_folder)
        outdir_busco_plot.append(plot_folder_sample)
        matching = dataFrame_results[dataFrame_results['name'] == sample]
        for index, row in matching.iterrows():
            ## name each copy after the dataset of *this* row; reusing the
            ## loop variable of the dataset loop above mislabelled the files
            ## and made them overwrite each other
            _copy_busco_summary(row, plot_folder_sample)

    print ("+ Generate plots for each subset")
    path_here = os.getcwd()
    try:
        for plot in outdir_busco_plot:
            BUSCO_plot(plot)
    finally:
        ## always go back to the starting directory, even if plotting fails
        os.chdir(path_here)

    print ("+ All plots generated...")
    print ("+ Check results under folders in : ", plot_folder)
    return()
def load_Genome(folder, STAR_exe, genomeDir, num_threads):
    """Run STAR with ``--genomeLoad LoadAndExit`` for the given genome index.

    A ``LoadMem`` subfolder is created inside ``folder`` and passed to STAR as
    ``--outFileNamePrefix``.

    :param folder: Folder where the ``LoadMem`` subfolder is created.
    :param STAR_exe: STAR executable.
    :param genomeDir: STAR genome index folder.
    :param num_threads: Value passed to ``--runThreadN``.
    :returns: Whatever ``system_call_functions.system_call`` returns.
    """
    prefix_dir = files_functions.create_subfolder('LoadMem', folder)
    star_args = (STAR_exe, genomeDir, num_threads, prefix_dir)
    load_command = "%s --genomeDir %s --runThreadN %s --outFileNamePrefix %s --genomeLoad LoadAndExit" % star_args

    print('\t+ Loading memory for STAR mapping')
    return system_call_functions.system_call(load_command, False, True)
def remove_Genome(STAR_exe, genomeDir, folder, num_threads):
    """Run STAR with ``--genomeLoad Remove`` for the given genome index.

    A ``RemoveMem`` subfolder is created inside ``folder`` and passed to STAR
    as ``--outFileNamePrefix``.

    :param STAR_exe: STAR executable.
    :param genomeDir: STAR genome index folder.
    :param folder: Folder where the ``RemoveMem`` subfolder is created.
    :param num_threads: Value passed to ``--runThreadN``.
    :returns: Whatever ``system_call_functions.system_call`` returns.
    """
    prefix_dir = files_functions.create_subfolder('RemoveMem', folder)
    star_args = (STAR_exe, genomeDir, prefix_dir, num_threads)
    remove_command = "%s --genomeDir %s --outFileNamePrefix %s --runThreadN %s --genomeLoad Remove" % star_args

    ## send command
    print('\t+ Removing memory loaded for STAR mapping')
    return system_call_functions.system_call(remove_command, False, True)
def create_genomeDir(folder, STAR_exe, num_threads, fasta_file, limitGenomeGenerateRAM):
    """Generate a STAR genome index from a fasta file.

    A ``STAR_index`` subfolder is created inside ``folder`` and STAR is run in
    ``genomeGenerate`` mode on it. If the system call returns a falsy value an
    error is printed and the program exits.

    :param folder: Folder where the ``STAR_index`` subfolder is created.
    :param STAR_exe: STAR executable.
    :param num_threads: Value passed to ``--runThreadN``.
    :param fasta_file: Value passed to ``--genomeFastaFiles``.
    :param limitGenomeGenerateRAM: Value passed to ``--limitGenomeGenerateRAM``.
    :returns: Path of the ``STAR_index`` subfolder.
    """
    index_dir = files_functions.create_subfolder("STAR_index", folder)
    star_args = (STAR_exe, limitGenomeGenerateRAM, num_threads, index_dir, fasta_file)
    generate_command = "%s --runMode genomeGenerate --limitGenomeGenerateRAM %s --runThreadN %s --genomeDir %s --genomeFastaFiles %s" % star_args

    print('\t+ genomeDir generation for STAR mapping')
    status = system_call_functions.system_call(generate_command, False, True)

    ## NOTE(review): this check only fires if system_call returns a falsy
    ## value on failure -- confirm against the helper's implementation.
    if not status:
        print("** ERROR: Some error ocurred during genomeDir creation... **")
        exit()

    return index_dir
def prepare_card_data(database_folder):
    """Download and parse the CARD ontology, unless a recent copy exists.

    A ``CARD`` subfolder is created inside ``database_folder``. If it holds a
    ``.success`` stamp that is at most 30 days old nothing is downloaded;
    otherwise the ontology is updated and parsed through ``card_trick`` and a
    new stamp is written when the parsed frame is not empty.

    :param database_folder: Database folder (converted to an absolute path).
    :returns: Path of the ``CARD`` subfolder, or ``FAIL`` when the parsed
        ontology frame is empty.
    """
    ## create CARD folder
    card_dir = HCGB_files.create_subfolder('CARD', os.path.abspath(database_folder))
    success_file = card_dir + '/.success'

    ## decide whether the ontology has to be (re)downloaded
    needs_download = True
    if os.path.isfile(success_file):
        previous_run = HCGB_time.read_time_stamp(success_file)
        print (colored("\tA previous command generated results on: %s [CARD Ontology Data]" %previous_run, 'yellow'))

        ## check time passed
        elapsed_days = HCGB_time.get_diff_time(success_file)
        print ("\t** %s days ago" %elapsed_days)

        if (elapsed_days > 30):
            print ("\t ** Downloading information again just to be sure...")
        else:
            print ("\t ** No need to download data again.")
            needs_download = False

    if needs_download:
        ## update database in a path
        obo_file = card_trick.ontology_functions.update_ontology(card_dir, False)

        ## get ontology and save it in csv
        ontology_frame = card_trick.ontology_functions.parse_ontology(obo_file, False)

        if ontology_frame.empty:
            ## NOTE(review): FAIL is a bare name, not a string; it must be
            ## defined at module level for this branch to work -- confirm.
            return FAIL

        ## success stamp
        HCGB_time.print_time_stamp(card_dir + '/.success')

    ## return folder name
    return card_dir
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    """Call variants with snippy for one sample, skipping finished runs.

    Results go to a ``<sample_name>_vs_<name>`` subfolder of ``outdir``. A
    ``.success`` stamp inside that subfolder marks a previous successful run;
    when it exists snippy is not called again.

    :param reference: Reference passed to ``variant_calling.snippy_call``.
    :param files: Sample files passed to ``variant_calling.snippy_call``.
    :param threads: Number of threads passed to snippy.
    :param outdir: Folder where the per-comparison subfolder is created.
    :param name: Name of the reference; used in the subfolder tag.
    :param contig_option: Passed through to ``variant_calling.snippy_call``.
    :param other_options: Passed through to ``variant_calling.snippy_call``.
    :param sample_name: Name of the sample; used in the subfolder tag.
    :param Debug: True/False for printing developer messages.
    :returns: ``'OK'`` when a previous run already succeeded, otherwise the
        code returned by ``variant_calling.snippy_call``.
    """
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)

    ## check if previously processed and succeeded
    filename_stamp = subdir + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, tag), 'yellow'))
        ## the stamp is only ever written after an 'OK' call, so report the
        ## same code instead of returning an unassigned variable
        return 'OK'

    ## call variant calling
    code = variant_calling.snippy_call(reference, files, threads, subdir,
                                       sample_name, contig_option, other_options, Debug)
    if code == 'OK':
        HCGB_time.print_time_stamp(filename_stamp)

    return code
def get_outfile(output_dir, name, index_name):
    """
    Build the path of the output file for a sample and a database.

    The file name is ``<name>_<basename of index_name>``. It is placed
    directly in ``output_dir`` when the module-level ``Project`` flag is set;
    otherwise in a per-sample subfolder created inside ``output_dir``.

    :param output_dir: Absolute path to results folder
    :param name: Name of the sample
    :param index_name: Name of the database

    :type output_dir: string
    :type name: string
    :type index_name: string

    :returns: Output file absolute path
    """
    db_tag = os.path.basename(index_name)
    target_dir = output_dir if Project else HCGB_files.create_subfolder(name, output_dir)
    return target_dir + '/' + name + '_' + db_tag
def download_ariba_databases(list_dbs, main_folder, Debug, threads):
    """Download and prepare ARIBA_ databases.

    For every database returned by :func:`get_ARIBA_dbs` a subfolder is
    created under ``<main_folder>/ARIBA``. The database is retrieved with
    :func:`ariba_getref` unless the sibling ``<db>_prepareref`` folder holds a
    ``.success`` stamp that is at most 30 days old. A message is printed in
    red when the retrieval does not return ``'OK'``.

    :param list_dbs: Databases requested; forwarded to :func:`get_ARIBA_dbs`.
    :param main_folder: Absolute path to database folder.
    :param Debug: True/false for printing developer messages
    :param threads: Number of CPUs to use.

    :type list_dbs: list
    :type main_folder: string
    :type Debug: Boolean
    :type threads: integer

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.file_functions.create_subfolder`

        - :func:`HCGB.functions.time_functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`

        - :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`

    .. include:: ../../links.inc
    """
    print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
    ariba_root = HCGB_files.create_subfolder("ARIBA", main_folder)

    ## print ARIBA databases:
    print ("+ Available databases:")
    for db_set in get_ARIBA_dbs(list_dbs):
        HCGB_aes.print_sepLine("-",30, False)
        print (colored("+ " + db_set,'yellow'))

        ## prepare folders and stamp time file
        db_dir = HCGB_files.create_subfolder(db_set, ariba_root)
        prepared_dir = db_dir + '_prepareref'
        stamp_file = prepared_dir + '/.success'

        ## decide whether the database has to be (re)downloaded
        fetch = True
        if os.path.isfile(stamp_file):
            previous_run = HCGB_time.read_time_stamp(stamp_file)
            print ("\t+ Database is downloaded in folder: ", db_dir)
            print ("\t+ Data is available and indexed in folder: ", prepared_dir)
            print (colored("\tDatabase was previously downloaded and prepared on: %s" %previous_run, 'yellow'))

            ## check if necessary to download again after several days
            elapsed_days = HCGB_time.get_diff_time(stamp_file)
            print ("\t\t** %s days ago" %elapsed_days)
            if (elapsed_days > 30):
                print ("\t\t** Downloading information again just to be sure...")
            else:
                fetch = False

        if fetch:
            status = ariba_getref(db_set, db_dir, Debug, threads)
        else:
            status = 'OK'

        ## report outcome
        if (status == 'OK'):
            print()
        else:
            print (colored("** ARIBA getref failed or generated a warning for " + db_set, 'red'))
def run_database(options):
    """Entry point of the *Database* module: download and update databases.

    Depending on ``options`` it:

    - prints the ARIBA/BUSCO/KMA help and exits;
    - retrieves NCBI entries (``ID_file`` and/or ``descendant``) and indexes
      their genomes into ``KMA_db/genbank``;
    - updates user data from ``project_folder`` and indexes its genomes into
      ``KMA_db/user_data``;
    - downloads ARIBA databases unless ``no_ARIBA`` is set;
    - downloads the KMA databases selected;
    - creates the BUSCO folder when ``BUSCO_dbs`` is given.

    All folders are created under ``options.path``, which is replaced by its
    absolute path. The module-level ``Debug`` flag is set from
    ``options.debug``.

    :param options: Parsed command-line options.
    :returns: An empty tuple.
    """
    ## debugging messages
    global Debug

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder (absolute path)
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)

        ## stays None until one of the options below retrieves data
        dataBase_NCBI = None

        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)

            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ## nothing retrieved: ID file given but it is not a regular file and
        ## no descendant option; stop here with a clear message instead of
        ## failing later on an unassigned variable
        if dataBase_NCBI is None:
            print(
                colored(
                    "ERROR: File provided does not exist: %s" %
                    options.ID_file, 'red'))
            exit()

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()

        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))
    else:
        ### ariba list databases: defaults, optionally extended or replaced
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        elif (options.ariba_dbs):
            ariba_dbs_list = set(ariba_dbs_list + options.ariba_dbs)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba user databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                ## these values live in `options`; the bare names used before
                ## are not defined in this function
                print(options.ariba_users_fasta)
                print(getattr(options, 'ariba_users_meta', None))

            ## [TODO]:
            ## ariba prepareref fasta and metadata

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
def run_report(options):
    """Entry point of the *Report generation* module.

    Collects the files and results already generated for every sample of a
    project and writes the reports requested under a ``summary_report``
    folder:

    - a species specific report (only "Saureus" is handled);
    - one fasta and one info file per gene listed in
      ``options.genes_ids_fasta``, under a ``genes`` subfolder;
    - an Excel file (``gene_ids_profile.xlsx``) with the profile of the genes
      listed in ``options.genes_ids_profile``.

    The module-level flags ``Debug``, ``Project`` and ``input_dir`` are set
    here. ``options`` is also updated in place (``batch``, ``pair``,
    ``project``, ``database``).

    :param options: Parsed command-line options.
    :returns: An empty tuple.
    """
    global Debug
    global input_dir
    global Project

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_spaTyper):
        ## help_format option
        get_spa_typing.help_spaTyper()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## set default
    options.batch = False

    ## debugging messages
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Report generation module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    start_time_partial = start_time_total

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ##
    print("\n+ Get project information:")

    ## get files: trimm, assembly, annotation
    pd_samples_retrieved = database_user.get_userData_files(options, input_dir)
    pd_samples_retrieved['new_name'] = pd_samples_retrieved['name']

    ## get info: profile, ident, cluster, MGE
    pd_samples_info = database_user.get_userData_info(options, input_dir)

    ## get databases to list
    #retrieve_databases = get_options_db(options)

    ## create output files
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "report",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

        print(colored("**DEBUG: pd_samples_info **", 'yellow'))
        print(pd_samples_info)

    ## generate output folder, if necessary
    print(
        "\n\n\n+ Generate a report summarizing analysis and sample information"
    )
    if not options.project:
        HCGB_files.create_folder(outdir)
        outdir_report = outdir
    else:
        ### report generation
        outdir_report = HCGB_files.create_subfolder("report", outdir)

    ## create report with all data
    summary_report = HCGB_files.create_subfolder("summary_report",
                                                 outdir_report)
    print("Folder: ", summary_report)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create species specific report if any
    ########################################
    if (options.species_report):
        ## Saureus
        if options.species_report == "Saureus":
            Saureus_specific(pd_samples_retrieved, pd_samples_info, options,
                             summary_report, outdir_dict)
        ## else
        ## to add accordingly

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########################################################
    ## create gene fasta sequences retrieval if desired
    ###########################################################
    if options.genes_ids_fasta:
        ## given a list of genes ids, retrieve sequence for all samples from profile
        in_file = os.path.abspath(options.genes_ids_fasta)
        if os.path.isfile(in_file):
            with open(in_file) as genes_handle:
                gene_names = [line.rstrip('\n') for line in genes_handle]

            print(
                '+ Retrieve selected genes sequences from the profile analysis for each sample.'
            )
            print('+ Searching gene:')

            ## get profiles available
            results_geneIDs = pd.DataFrame(columns=('sample', 'gene', 'id',
                                                    'sequence'))

            ## group by a scalar key so that the group names are plain sample
            ## names: a one-element list key yields 1-tuples on pandas >= 2,
            ## which would break the string concatenation further below
            sample_frame = pd_samples_info.groupby("name")
            for g in gene_names:
                print("\t+", g)
                for name, cluster_df in sample_frame:
                    my_list_profiles = cluster_df.loc[
                        cluster_df['tag'] == 'profile']['ext'].to_list()
                    if options.debug:
                        print("name: ", name)
                        print("my_list_profiles:")
                        print(my_list_profiles)

                    for p in my_list_profiles:
                        main_profile_folder = cluster_df.loc[
                            cluster_df['ext'] == p]['dirname'].to_list()[0]
                        ## results folder is the lower-case profile name;
                        ## for vfdb the folder carries a '_full' suffix
                        p = p.lower()
                        if p == 'vfdb':
                            p = p + '_full'

                        profile_folder = os.path.join(main_profile_folder, p)
                        (seq_id, seq_sequence
                         ) = retrieve_genes.retrieve_genes_ids_sequences(
                             profile_folder, g, Debug)
                        if (seq_id):
                            ## save results
                            results_geneIDs.loc[len(results_geneIDs)] = (
                                name, g, seq_id, seq_sequence)

            ## save for each gene in a separate fasta file
            list_of_genes = set(results_geneIDs['gene'].to_list())

            ## debug
            if Debug:
                print("** DEBUG **")
                print(results_geneIDs)
                print(list_of_genes)

            ## Save results
            genes_folder = HCGB_files.create_subfolder('genes', summary_report)
            for gene_retrieved in list_of_genes:
                this_frame = results_geneIDs[results_geneIDs['gene'] ==
                                             gene_retrieved]

                gene_retrieved_file = os.path.join(genes_folder,
                                                   gene_retrieved)
                gene_retrieved_fasta = gene_retrieved_file + ".fasta"
                gene_retrieved_info = gene_retrieved_file + "_info.txt"

                ## context managers close both files even if a write fails
                with open(gene_retrieved_fasta, 'w') as fasta_hd, \
                        open(gene_retrieved_info, 'w') as info_hd:
                    for item, row in this_frame.iterrows():
                        string2write = ">" + row['sample'] + '_' + row[
                            'gene'] + '\n' + row['sequence'] + '\n'
                        string2write_info = row['sample'] + '\t' + row[
                            'gene'] + '\t' + row['id'] + '\n'
                        fasta_hd.write(string2write)
                        info_hd.write(string2write_info)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create gene promoter fasta sequences retrieval if desired
    ########################################
    if options.promoter_bp:
        ## retrieve as many bp as necessary from genes_ids_fasta
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")
        #get_promoter.get_promoter(file, geneOfInterest, basePairs, sampleName, option, debug=False):

    ########################################
    ## create gene specific report if any
    ########################################
    if options.genes_ids_profile:
        if options.species_report == "Saureus":
            if Debug:
                print("** options.genes_ids_profile **")
                print("Analysis already done for Saureus")
        else:
            in_file = os.path.abspath(options.genes_ids_profile)
            with open(in_file) as genes_handle:
                gene_names = [line.rstrip('\n') for line in genes_handle]

            results_Profiles = retrieve_genes.get_genes_profile(
                pd_samples_info, gene_names, options.debug, "name")
            if options.debug:
                print("results_Profiles")
                print(results_Profiles)

            ## write excel file; leaving the context saves and closes it
            ## (ExcelWriter.save() is no longer available in pandas >= 2)
            name_excel = summary_report + '/gene_ids_profile.xlsx'
            with pd.ExcelWriter(name_excel, engine='xlsxwriter') as writer:
                results_Profiles.to_excel(writer, sheet_name="gene_ids")

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###############################################
    ## Search for any additional fasta sequence
    ###############################################
    if options.genes_fasta:
        ## given a list of fasta sequences search using blast against proteins annotated or genome
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Report generation module.")
    return ()
def _get_sample_gbk(this_sample_df, debug):
    """Return the non-empty GenBank ('gbf') file listed for a sample, or exit."""
    ## get gbk file
    gbk = this_sample_df.loc[this_sample_df['ext'] == 'gbf', 'sample'].values[0]

    ## debug
    if debug:
        print ("** DEBUG: this_sample_df")
        print (this_sample_df)
        print ('gbk:' + gbk)

    ## check if exists
    if HCGB_files.is_non_zero_file(gbk):
        print('\t+ Genbank file format reference available.')
        return gbk

    print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
    exit()


def get_reference_gbk(options):
    """Return the GenBank file of the reference selected in ``options``.

    The first option set wins, in this order:

    - ``Genbank_ID``: NCBI entry, downloaded into ``options.database`` if it
      is not listed there yet;
    - ``user_sample_ID``: sample from the ``user_data`` folder of the
      database, updating the database first if the sample is not found;
    - ``project_sample_ID``: sample from the project folder ``options.input``;
    - ``user_ref``: file given by the user (converted to an absolute path).

    The program exits if the reference, or its GenBank file, is not available.

    :param options: Parsed command-line options.
    :returns: Path of the reference GenBank file; an empty string when none
        of the options above is set.
    """
    reference_gbk_file = ""

    ####################
    ## Genbank_ID
    ####################
    if options.Genbank_ID:
        db_frame_ncbi = database_generator.getdbs('NCBI', options.database, 'genbank', options.debug)

        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_ncbi **", 'yellow'))
            print (db_frame_ncbi)

        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.database)
        dir_path = os.path.join(NCBI_folder, 'genbank', 'bacteria', options.Genbank_ID)

        if (options.Genbank_ID in db_frame_ncbi.index):
            print('\t+ Reference (%s) available in database provided' %options.Genbank_ID)
        else:
            print ('\t+ Reference (%s) is not available in database provided' %options.Genbank_ID)
            print ('\t+ Try to download it.')
            database_generator.ngd_download(dir_path, options.Genbank_ID, NCBI_folder)

        ## get files download
        (genome, prot, gff, gbk) = database_generator.get_files_download(dir_path)
        if options.debug:
            print (colored("**DEBUG: genome:" + genome, 'yellow'))
            print (colored("**DEBUG: prot:" + prot, 'yellow'))
            print (colored("**DEBUG: gff:" + gff, 'yellow'))
            print (colored("**DEBUG: gbk:" + gbk, 'yellow'))

        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n+ No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## user_sample_ID
    ####################
    elif options.user_sample_ID:
        db_frame_user_Data = database_user.get_userData_files(options, os.path.join(options.database, 'user_data'))
        df_data = db_frame_user_Data.groupby('name')

        try:
            ## look up the sample requested through --user_sample_ID (this
            ## branch previously queried project_sample_ID by mistake)
            this_sample_df = df_data.get_group(options.user_sample_ID)
            print('\t+ Reference (%s) available in database folder provided' %options.user_sample_ID)
        except KeyError:
            print (colored('** WARNING: Reference (%s) not available in database folder provided' %options.user_sample_ID, 'yellow'))
            print ('\t+ Lets try to update the database first.')
            ## NOTE(review): input_dir is not defined in this function; it
            ## must exist as a module-level name for this call to work --
            ## confirm (options.input may be what is intended).
            db_frame_user_dataUpdated = database_user.update_database_user_data(options.database, input_dir, options.debug, options)
            df_data = db_frame_user_dataUpdated.groupby('name')

            try:
                this_sample_df = df_data.get_group(options.user_sample_ID)
                print('\t+ Reference (%s) available in database updated' %options.user_sample_ID)
                db_frame_user_Data = db_frame_user_dataUpdated
            except KeyError:
                print(colored('\n** ERROR: No reference (%s) available in database updated. Some error occurred...' %options.user_sample_ID, 'red'))
                exit()

        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_user_Data **", 'yellow'))
            print (db_frame_user_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)

        reference_gbk_file = _get_sample_gbk(this_sample_df, options.debug)

    ####################
    ## project_sample_ID
    ####################
    elif options.project_sample_ID:
        db_frame_project_Data = database_user.get_userData_files(options, options.input)
        df_data = db_frame_project_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.project_sample_ID)
            print('\t+ Reference (%s) available in project folder provided' %options.project_sample_ID)
        except KeyError:
            print (colored('** ERROR: Reference (%s) not available in project folder provided' %options.project_sample_ID, 'red'))
            print ('\t+ Check the spelling or provide a valid ID.')
            exit()

        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_project_Data **", 'yellow'))
            print (db_frame_project_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)

        reference_gbk_file = _get_sample_gbk(this_sample_df, options.debug)

    ####################
    ## user_ref
    ####################
    elif options.user_ref:
        options.user_ref = os.path.abspath(options.user_ref)
        if HCGB_files.is_non_zero_file(options.user_ref):
            print('\t+ Reference provided via --user_ref is available and ready to use.')
        else:
            print('\n** ERROR: Reference provided via --user_ref not available or accessible.')
            print(colored('\n+ Check the path or integrity of the file. Some error occurred...', 'red'))
            exit()
        reference_gbk_file = options.user_ref

    return (reference_gbk_file)
def run_phylo(options):
    """
    Main function acting as an entry point to the module *phylo*.

    Steps:

    - retrieve the reference GenBank file (:func:`get_reference_gbk`);
    - map the samples against it (:func:`map_samples`);
    - build the core alignment with snippy-core;
    - compute the SNP distance matrix from the alignment;
    - build a maximum-likelihood tree.

    Results are written under ``<outdir>/report/phylo/<options.name>``, where
    ``outdir`` is the input folder in project mode or
    ``options.output_folder`` in detached mode. The module-level ``Debug``
    flag is set from ``options.debug``.

    :param options: Parsed command-line options.
    :returns: An empty tuple.
    """
    ## debugging messages
    global Debug

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## init time
    start_time_total = time.time()

    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Phylogenetic reconstruction")
    print ("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir=""

    ## set mode: project/detached
    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get the database
    options.database = os.path.abspath(options.database)

    ### parse the reference
    print ("+ Retrieve the reference...")
    reference_gbk_file = get_reference_gbk(options)

    ## generate output folder, if necessary
    print ("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ##################################
    ## select samples and map
    ####################################
    print ("+ Retrieve samples to map available...")
    dict_folders = map_samples(options, reference_gbk_file, input_dir, outdir)

    if Debug:
        print (colored("**DEBUG: dict_folders **", 'yellow'))
        print (dict_folders)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ##################################
    ## Create core alignment
    ##################################
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    phylo_dir = HCGB_files.create_subfolder("phylo", outdir_report)
    analysis_dir = HCGB_files.create_subfolder(options.name, phylo_dir)
    snippy_dir = HCGB_files.create_subfolder("snippy", analysis_dir)

    list_folders = list(dict_folders.values())
    options_string = ""
    variant_calling.snippy_core_call(list_folders, options_string, options.name,
                                     snippy_dir, options.output_format, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## snp distance matrix
    snp_distance_dir = HCGB_files.create_subfolder("snp_distance", analysis_dir)
    name_matrix = os.path.join(snp_distance_dir, "snp_matrix_" + options.name)

    countGaps = False
    aln_file = os.path.join(snippy_dir, options.name + '.aln')
    phylo_parser.get_snp_distance(aln_file, options.output_format, countGaps, name_matrix, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## phylogenetic analysis
    iqtree_output = HCGB_files.create_subfolder("iqtree", analysis_dir)
    phylo_parser.ml_tree(snippy_dir, options.name, options.threads, iqtree_output, Debug)

    ## time stamp (timestamp is taken from HCGB_time, as in every other call
    ## of this function, not from HCGB_files)
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("+ Exiting Phylogenetic reconstruction module.")
    return()
def mapReads_module(options, pd_samples_retrieved, outdir_dict, Debug,
                    max_workers_int, threads_job, start_time_partial, outdir):
    """
    Map the reads of every sample with STAR and, optionally, build a MultiQC report.

    The reference genome is loaded once before the samples are mapped and
    removed again afterwards (``mapReads.load_Genome`` /
    ``mapReads.remove_Genome``). One ``mapReads_caller`` job is submitted per
    sample to a thread pool.

    :param options: Parsed options. Attributes read here: ``fasta``,
        ``genomeDir``, ``threads``, ``limitRAM`` and ``skip_report``.
        ``options.fasta`` / ``options.genomeDir`` are rewritten as absolute paths.
    :param pd_samples_retrieved: Dataframe with, at least, the columns
        ``new_name`` (sample name used for grouping) and ``sample`` (file path).
    :param outdir_dict: Dictionary mapping each sample name to its output folder.
    :param Debug: True/False for debugging messages.
    :param max_workers_int: Number of jobs running concurrently.
    :param threads_job: Threads handed to each job.
    :param start_time_partial: Time reference passed to ``time_functions.timestamp``.
    :param outdir: Main output folder; the report is created under ``<outdir>/report/STAR``.
    :returns: Value returned by the last ``time_functions.timestamp`` call.
    """

    ## Group dataframe by sample name.
    ## The column label is given as a scalar (not a one-element list): with a
    ## list, pandas >= 2.0 yields 1-tuples as group keys and the lookup
    ## outdir_dict[name] below would fail with KeyError.
    sample_frame = pd_samples_retrieved.groupby("new_name")

    ## options
    STAR_exe = set_config.get_exe("STAR", Debug=Debug)
    cwd_folder = os.path.abspath("./")
    folder = files_functions.create_subfolder('STAR_files', cwd_folder)

    ## For many samples it will have to load genome index in memory every time.
    ## For a unique sample it will not matter. Take care genome might stay in memory.
    ## Use before loop option LoadAndExit and then:
    ## in loop
    ## Use option LoadAndKeep, set shared memory > 30 Gb
    ## when finished loop Remove memory

    ## check reference
    if (options.fasta):
        print("+ Genome fasta file provided")
        print("+ Create genomeDir for later usage...")
        options.fasta = os.path.abspath(options.fasta)

        ## create genomeDir
        options.genomeDir = mapReads.create_genomeDir(folder, STAR_exe,
                                                      options.threads,
                                                      options.fasta,
                                                      options.limitRAM)
    elif (options.genomeDir):
        print("+ genomeDir provided.")
        options.genomeDir = os.path.abspath(options.genomeDir)

    ## remove previous reference genome from memory
    print("+ Remove genome in memory from previous call... (if any)")
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder, options.threads)

    ## load reference genome
    mapReads.load_Genome(folder, STAR_exe, options.genomeDir, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    print("+ Mapping sequencing reads for each sample retrieved...")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(mapReads_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, STAR_exe,
                            options.genomeDir, options.limitRAM, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                ## result() re-raises any exception generated within the job
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Mapping reads has finished...")

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## remove reference genome from memory
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## unique output folders
        my_outdir_list = set(outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        map_report = files_functions.create_subfolder("STAR", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "STAR", map_report,
                                           "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % map_report)

    return (start_time_partial)
def discardPlasmids(contigs, plasmids, path, sample):
    """
    Split the main assembly into chromosomal and plasmid contigs.

    The plasmid sequences are searched (blastn) against a BLAST database built
    from the main assembly. Every contig of the main assembly reported as a
    hit is written to ``<sample>_plasmid.fna.tmp``; the remaining contigs are
    written to ``<sample>_chromosome.fna.tmp``. Both files are created in
    ``<dirname(path)>/<sample>/``.

    :param contigs: Fasta file with the main assembly.
    :param plasmids: Fasta file with the plasmid assembly, or the string
        ``'FAIL'`` when no plasmids are available.
    :param path: Folder where the ``blast_search`` subfolder is created.
    :param sample: Sample name, used for the output folder and file names.
    :returns: Tuple ``(chromosome_file, plasmid_file)``. When ``plasmids`` is
        ``'FAIL'`` the assembly is copied as it is and the tuple returned is
        ``(chromosome_file, 'FAIL')``.
    """
    ## common prefix for the files generated
    out_prefix = os.path.dirname(path) + '/' + sample + '/' + sample
    contig_out_file = out_prefix + '_chromosome.fna.tmp'

    ## check if any plasmids
    if (plasmids == 'FAIL'):
        ## No plasmids assembled: nothing to discard from the main assembly
        shutil.copy(contigs, contig_out_file)
        return (contig_out_file, plasmids)

    ## discard
    print ('+ Check if any plasmids are also reported in main assembly...')
    folder = HCGB_files.create_subfolder('blast_search', path)

    ## makeblastDB
    ## NOTE(review): the original call was `HCGB_.makeblastdb`, a name that is
    ## not used anywhere else; HCGB_blast (used for blastn/parse below) is
    ## assumed to be the intended module -- confirm against the file imports.
    dbName = folder + '/mainAssembly'
    HCGB_blast.makeblastdb(dbName, contigs)

    ## blastn command
    outFile = folder + '/blastn_output.txt'
    threads = 1
    HCGB_blast.blastn(outFile, dbName, plasmids, threads)

    ########################
    ## parseBlast results
    ########################
    ## thresholds
    eval_thresh_float = float(1e-20)
    aln_thresh_given = 90
    min_length = 1000
    outFile_parsed = folder + '/blastn_output_parsed.txt'

    ## ids of the contigs to discard; a set, as several hsps might report the
    ## same contig
    sequences2discard = set()

    print ('+ Parsing BLAST results generated...\n')

    ## get results: files are closed even if the parser fails
    with open(outFile) as fh, open(outFile_parsed, 'w') as output_file:
        for blast_record in HCGB_blast.parse(fh,
                                             eval_thresh=eval_thresh_float,
                                             aln_thresh=aln_thresh_given,
                                             length_thresh=min_length):
            for hit in blast_record.hits:
                for hsp in hit:
                    sequences2discard.add(hsp.sid)
                    aln = (int(hsp.qlen)/int(hsp.slen))*100

                    output_file.write('****Alignment****\n')
                    output_file.write('query id: {}\n'.format(blast_record.qid))
                    output_file.write('sequence: %s\n' %hsp.sid)
                    output_file.write('e value: %s\n' %hsp.evalue)
                    output_file.write('aln: %s\n' %hsp.length)
                    output_file.write('qlen: %s [>%s]\n' %(hsp.qlen, min_length))
                    output_file.write('aln/slen: %s [> %s]\n\n' %(aln, aln_thresh_given))

    items = len(sequences2discard)
    print ('There are %s sequences to discard from main assembly identified as plasmids' %items)

    ## print filtered contigs
    plasmid_out_file = out_prefix + '_plasmid.fna.tmp'

    with open(contig_out_file, 'w') as contig_out_file_handle, \
            open(plasmid_out_file, 'w') as plasmid_out_file_handle:
        for seq in SeqIO.parse(contigs, 'fasta'):
            if seq.id in sequences2discard:
                plasmid_out_file_handle.write(seq.format("fasta"))
                plasmid_out_file_handle.write('\n')
            else:
                contig_out_file_handle.write(seq.format("fasta"))
                contig_out_file_handle.write('\n')

    return (contig_out_file, plasmid_out_file)
def parse_options(arg_dict):
    """
    Build the dataframe of input entries from the input option provided.

    Exactly one input branch is followed, tested in this order:

    - ``annot_file``: a GFF/GenBank annotation file, or (``batch``) a
      comma-separated file listing several annotation files.
    - ``GenBank_id``: a NCBI GenBank/RefSeq ID, or (``batch``) a file
      listing several IDs. Entries are downloaded into the database folder.
    - ``tax_id``: a NCBI taxonomy ID, or (``batch``) a file listing several
      IDs. GenBank entries are selected for them and downloaded, unless
      ``dry_run`` is set, in which case the lists are written under
      ``<output_folder>/info/input`` and the process exits.
    - ``project``: a previous analysis folder (not implemented yet).

    :param arg_dict: Parsed command-line options. Attributes read here:
        ``output_folder``, ``annot_file``, ``batch``, ``debug``,
        ``sample_name``, ``ref_file``, ``GenBank_id``, ``db_folder``,
        ``assembly_level``, ``tax_id``, ``k_random``, ``section``,
        ``dry_run`` and ``project``. ``annot_file``, ``ref_file`` and
        ``GenBank_id`` (batch mode) are rewritten as absolute paths.
    :returns: Dataframe of entries indexed by the column ``new_name``.
    :raises NotImplementedError: If ``project`` is the input option provided.
    :raises ValueError: If no input option is provided.
    """
    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:', 'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file, 'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))

            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            ## default name if none provided
            sample_name = arg_dict.sample_name or "sample"
            dict_entries = {sample_name: arg_dict.annot_file}

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format (named annot_format: do not shadow builtin format)
            annot_format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + annot_format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""

            if (annot_format == 'gbk'):
                ## get information from each sample
                (taxonomy, organism) = BacDup.scripts.functions.get_gbk_information(
                    file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (annot_format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ## reference file lists a genome for each sample name
                        ref_entries = HCGB_main.file2dictionary(
                            arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome,
                                           file_annot, annot_format, prot,
                                           plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder(
                "db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list; discard empty entries
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)
        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(tax_entry) for tax_entry in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get, allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
            db_folder,
            string_info_total,
            int(arg_dict.k_random),
            arg_dict.debug,
            assembly_level_given=arg_dict.assembly_level,
            group_given=group_obtained,
            section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        downloaded_file = os.path.join(input_info_dir, 'Downloaded.txt')
        all_entries_file = os.path.join(input_info_dir, 'all_entries.txt')
        HCGB_main.printList2file(downloaded_file, strains2get)
        HCGB_main.printList2file(all_entries_file, allstrains_available)

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  all_entries_file)
            print("+ Subset of entries generated and printed in file:\n\t" +
                  downloaded_file)
            print(
                "\n\nIf random numbers selected, take into account re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## create df_accID to store data
        ## TODO: no dataframe is generated for this option yet. Fail with an
        ## explicit message instead of the UnboundLocalError raised below.
        raise NotImplementedError(
            "Input from a previous BacDup project folder is not implemented yet")

    else:
        ## no input option: df_accID would never be created
        raise ValueError(
            "No input provided: annotation file, GenBank ID, taxonomy ID or project folder required")

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)
def run_annotation(options):
    """
    Main function acting as an entry point to the annotation module.

    Steps, in the order they are executed:

    1. Show the help message requested (if any) and exit.
    2. Resolve input/output folders (project mode unless ``options.detached``).
    3. Retrieve the assemblies (``fna`` files) with ``sampleParser.files.get_files``.
    4. Submit one ``annot_caller`` job per sample to a thread pool.
    5. Unless ``options.skip_report``, generate a MultiQC report under
       ``<outdir>/report/annotation`` (skipped if a ``.success`` stamp exists).
    6. Check each annotation with ``qc.BUSCO_check``.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, ``help_format``, ``help_BUSCO``, ``help_project``,
        ``help_multiqc``, ``help_Prokka``, ``input``, ``detached``,
        ``output_folder``, ``kingdom``, ``genera``, ``threads`` and
        ``skip_report``. ``batch`` and ``project`` are assigned on the object.
    :returns: Empty tuple.
    """

    ## init time
    start_time_total = time.time()

    ## debugging messages: module-level flag
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## stop here as any other help option does; otherwise the whole
        ## annotation process would be started after printing the help
        exit()
    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default: results are stored within the input folder
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna; Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap; Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **",
                    'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                ## result() re-raises any exception generated within the job
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = list(outdir_dict.values())
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated: a stamp file is left within
        ## the report folder after a successful call
        filename_stamp = PROKKA_report + '/.success'
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(
                colored(
                    "\tA previous report generated results on: %s" % stamp,
                    'yellow'))
        else:
            ## generate report
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
def getdbs(source, database_folder, option, debug): """Get databases available within the folder provided. :param source: Type of database to search: ARIBA, KMA, NCBI, MLST, user_data :param database_folder: Absolute path to database folder. :param option: String containing multiple entries separated by '#' that indicate the type of database entries to search within each source type. :param debug: True/False for debugging messages. :type source: string :type database_folder: string :type option: string :type debug: bool :returns: Dataframe containing absolute paths to the available databases for each type requested. It contains columns for: "source", "db", "path" e.g.: source = KMA option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank ** e.g.: source = NCBI option = genbank """ ## init dataframe colname = ["source", "db", "path"] db_Dataframe = pd.DataFrame(columns=colname) ## read folders within database if os.path.isdir(database_folder): files = os.listdir(database_folder) ## ARIBA/KMA_db/genbank/user_data else: return db_Dataframe ## debug message if (debug): print(colored("Folders: " + str(files), 'yellow')) print() ## user input dbs2use = [] option_list = option.split("#") for option_item in option_list: ## debug message if (debug): print(colored("Option item: " + option_item, 'yellow')) ### dbs2use_tmp = [] ## kma if (option_item.startswith('kma')): if (option_item.startswith('kma:')): dbs2use_tmp = option_item.split(":")[1].split(",") elif (option_item.startswith('kma_external:')): external = option_item.split(":")[1].split(",") ## add to dataframe for ext in external: name_ext = os.path.basename(ext) db_Dataframe.loc[len(db_Dataframe)] = [ 'KMA_External', name_ext, ext ] elif (option_item.startswith('kma_user_data:')): dbs2use_tmp = option_item.split(":")[1].split(",") elif (option_item.startswith('kma_NCBI:')): dbs2use_tmp = option_item.split(":")[1].split(",") ### ARIBA elif (option_item.startswith('ARIBA:')): 
dbs2use = option_item.split(":")[1].split(",") ### NCBI: genbank elif (option_item.startswith('genbank')): dbs2use.append('genbank') ### NCBI: taxonomy ID elif (option_item.startswith('tax_id')): dbs2use.append('taxonomy_id') ### user_data elif (option_item.startswith('user_data')): dbs2use.append('user_data') ### MLST elif (option_item.startswith('MLST')): dbs2use_tmp = option_item.split(":")[1].split(",") ### Mash elif (option_item.startswith('Mash')): if (option_item.startswith('Mash_external_data:')): external = option_item.split(":")[1].split(",") ## add to dataframe for ext in external: name_ext = os.path.basename(ext) name_ext_ = name_ext.split('.fna')[0] db_Dataframe.loc[len(db_Dataframe)] = [ 'Mash_external', name_ext_, ext ] else: dbs2use_tmp = option_item.split(":")[1].split(",") ### Other? else: dbs2use.append( option_item ) ## add ARIBA, user_data or genbank option if provided ## get all dbs2use = dbs2use + dbs2use_tmp ## debug message if (debug): print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow')) ## init dataframe #colname = ["source", "db", "path"] #db_Dataframe = pd.DataFrame(columns = colname) ############### #### ARIBA #### ############### if (source == 'ARIBA'): ### Check if folder exists ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder) ### get information ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use) ## get names for ariba_db in ARIBA_dbs: this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref') if os.path.exists(this_db): code_check_db = ariba_caller.check_db_indexed(this_db, 'NO') if (code_check_db == True): db_Dataframe.loc[len(db_Dataframe)] = [ 'ARIBA', ariba_db, this_db ] print( colored( "\t- ARIBA: including information from database: " + ariba_db, 'green')) else: print("+ Database: ", ariba_db, " is not downloaded...") print("+ Download now:") folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder) code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug, 2) ## get names if 
(code_db == 'OK'): db_Dataframe.loc[len(db_Dataframe)] = [ 'ARIBA', ariba_db, this_db ] print( colored( "\t- ARIBA: including information from database: " + ariba_db, 'green')) ############# #### KMA #### ############# elif (source == 'KMA'): ### Check if folder exists KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder) kma_dbs = os.listdir(KMA_db_abs) ## debug message if (debug): print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow')) ### get information for db in dbs2use: this_db = KMA_db_abs + '/' + db ## debug message if (debug): print(colored("this_db:" + this_db, 'yellow')) #### genbank if (db == "genbank"): ## KMA databases exists this_db_file = this_db + '/genbank_KMA' if os.path.isfile(this_db_file + '.comp.b'): print( colored( "\t- genbank: including information from different reference strains available.", 'green')) ## include data from NCBI db_Dataframe.loc[len(db_Dataframe)] = [ 'KMA_genbank', 'genbank', this_db_file ] #### user_data elif (db == "user_data"): ## KMA databases exists this_db_file = this_db + '/userData_KMA' if os.path.isfile(this_db_file + '.comp.b'): print( colored( "\t- user_data: including information from user previously generated results", 'green')) ## include user data db_Dataframe.loc[len(db_Dataframe)] = [ 'KMA_user_data', 'user_data', this_db_file ] ## default KMA databases: bacteria & plasmids else: ## if (db == 'plasmids'): prefix = '.T' elif (db == 'viral'): prefix = '.TG' else: prefix = '.ATG' this_db_file = os.path.join(this_db, db, db + prefix) ## debug message if (debug): print(colored("this_db_file:" + this_db_file, 'yellow')) if os.path.isfile(this_db_file + '.comp.b'): db_Dataframe.loc[len(db_Dataframe)] = [ 'KMA_db', db, this_db_file ] print( colored( "\t- KMA: including information from database " + db, 'green')) else: print( colored("\t**KMA: Database %s was not available." 
% db, 'red')) ## if missing: call download module print("+ Download missing KMA_db (%s) provided" % db) species_identification_KMA.download_kma_database( os.path.join(database_folder, 'KMA_db', db), db, debug) if os.path.isfile(this_db_file + '.comp.b'): db_Dataframe.loc[len(db_Dataframe)] = [ 'KMA_db', db, this_db_file ] print( colored( "\t- KMA: including information from database " + db, 'green')) else: print( colored( "\t**KMA: Database %s was not available." % db, 'red')) ############## #### NCBI #### ############## elif (source == 'NCBI'): ## TODO: get additional information from ## info_file = dir_path + '/info.txt' ### Check if folder exists path_genbank = os.path.join(database_folder, source, 'genbank') db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder) ### genbank entries downloaded if dbs2use[0] == 'genbank': ## if os.path.exists(path_genbank + '/bacteria'): genbank_entries = os.listdir( os.path.join(path_genbank, 'bacteria')) for entry in genbank_entries: this_db = os.path.join(path_genbank, 'bacteria', entry) db_Dataframe.loc[len(db_Dataframe)] = [ 'NCBI:genbank', entry, this_db ] elif dbs2use[0] == 'tax_id': tax_id_entries = db2use_abs ################### #### user_data #### ################### elif (source == 'user_data'): ### Check if folder exists db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder) user_entries = os.listdir(db2use_abs) for entry in user_entries: this_db = db2use_abs + '/' + entry db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db] ################# #### PubMLST #### ################# elif (source == 'MLST'): ### get information for db in dbs2use: if db == 'PubMLST': ### Check if folder exists db2use_abs = HCGB_files.create_subfolder( 'PubMLST', database_folder) list_profiles = os.listdir(db2use_abs) for entry in list_profiles: this_db = db2use_abs + '/' + entry db_Dataframe.loc[len(db_Dataframe)] = [ 'MLST', 'PubMLST', entry + ',' + this_db ] print( colored( "\t- MLST: including 
information from profile: " + entry, 'green')) else: db_Dataframe.loc[len(db_Dataframe)] = [ 'MLST', 'user_profile', db ] print( colored( "\t- MLST: including information from profile provided by user: "******"genbank"): ### Check if folder exists db2use_abs = database_folder + '/NCBI/genbank/bacteria' if os.path.exists(db2use_abs): print( colored( "\n\t- genbank: including information from different reference strains available.", 'green')) ## include data from NCBI genbank_entries = os.listdir(db2use_abs) for entry in genbank_entries: print('\t+ Reading information from sample: ', entry) this_db = db2use_abs + '/' + entry ## get additional information from info_file = this_db + '/info.txt' info_data = pd.read_csv(info_file).set_index('ID') info_data.fillna("NaN", inplace=True) ## get readable name for each strain entry_strain = str(info_data.loc[entry]['name']) if entry_strain == 'NaN': ## TODO: debug if it works entry_strain = entry print() else: print('\t\t+ Rename into: ', entry_strain) list_msh = HCGB_main.retrieve_matching_files( this_db, '.sig', debug) if (list_msh): ## print original in file file2print = this_db + '/.original' if not os.path.exists(file2print): original = ['NaN'] else: original = HCGB_main.readList_fromFile( file2print) db_Dataframe.loc[len(db_Dataframe)] = [ 'genbank', entry_strain, list_msh[0], this_db + '/mash/' + original[0], original[1], original[2], this_db ] else: ## index assembly or reads... 
list_fna = HCGB_main.retrieve_matching_files( this_db, 'genomic.fna', debug) ## not available db_Dataframe.loc[len(db_Dataframe)] = [ 'genbank', entry_strain, 'NaN', list_fna[0], 'NaN', 'NaN', this_db ] #### user_data elif (db == "user_data"): print( colored( "\n\t- user_data: including information from user previously generated results", 'green')) ## include user data db2use_abs = HCGB_files.create_subfolder( 'user_data', database_folder) user_entries = os.listdir(db2use_abs) for entry in user_entries: if entry == 'user_database.csv': continue print('\t+ Reading information from sample: ', entry) this_db = db2use_abs + '/' + entry this_mash_db = this_db + '/mash/' + entry + '.sig' if os.path.exists(this_mash_db): ## print original in file file2print = this_db + '/mash/.original' if not os.path.exists(file2print): original = ['NaN', 'NaN', 'NaN'] else: original = HCGB_main.readList_fromFile(file2print) ## db_Dataframe.loc[len(db_Dataframe)] = [ 'user_data', entry, this_mash_db, this_db + '/mash/' + original[0], original[1], original[2], this_db + '/mash' ] else: ## not available list_fna = HCGB_main.retrieve_matching_files( this_db + '/assembly', '.fna', debug) db_Dataframe.loc[len(db_Dataframe)] = [ 'user_data', entry, 'NaN', list_fna[0], 'NaN', 'NaN', this_db + '/mash' ] #### external_data ### TODO: Fix this mash_bin = "" #set_config.get_exe('mash') if any(name in 'Mash_external' for name in db_Dataframe['source'].to_list()): print( colored( "\t- external_data: including information from external data provided by user", 'green')) ## include user data db_Dataframe = db_Dataframe.set_index("db", drop=False) frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external'] for index, row in frame.iterrows(): print('\t+ Reading information for file: ', row['db']) outfile = row['path'] + '.msh' if not os.path.exists(outfile): path_file = os.path.dirname(row['path']) this_db_file = min_hash_caller.sketch_database([row['path']], mash_bin, row['path'], row['db'], 
path_file) HCGB_aes.print_sepLine("*", 50, False) db_Dataframe.loc[row['db']] = [ 'Mash_external', row['db'], outfile, row['path'] ] ## index by id db_Dataframe = db_Dataframe.set_index("db", drop=False) return (db_Dataframe)
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
    Executes a KMA identification job for one sample and one database.

    The job is skipped when a ``.success_<tag>`` time stamp file left by a
    previous run is found in the ``kma`` subfolder of the sample. Otherwise
    KMA is called and ``HCGB_time.print_time_stamp`` is invoked on the stamp
    file afterwards.

    :param outdir_file: Output folder of the sample; a ``kma`` subfolder is created inside.
    :param list_files: Input files of the sample, handed over to the KMA call.
    :param name: Sample name.
    :param database: KMA database to search against.
    :param threads: Number of threads handed over to the KMA call.
    :param Debug: If true, print debugging messages.

    :type name: string
    :type database: string

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`

        - :func:`BacterialTyper.module.ident.get_outfile`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`
    """
    if Debug:
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## folder, binary and output prefix for this sample/database pair
    kma_folder = HCGB_files.create_subfolder("kma", outdir_file)
    kma_bin = set_config.get_exe("kma")
    outfile = get_outfile(kma_folder, name, database)

    ## time stamp file flagging a finished run
    stamp_file = kma_folder + '/.success_' + os.path.basename(outfile)

    if Debug:
        print("Outdir: ", kma_folder)
        print("outfile: ", outfile)
        print("Filename_stamp: ", stamp_file)

    ## nothing else to do if a previous run left its time stamp
    if os.path.isfile(stamp_file):
        previous_run = HCGB_time.read_time_stamp(stamp_file)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (previous_run, name), 'yellow'))
        return

    if Debug:
        print(
            colored(
                "**DEBUG: species_identification_KMA.kma_ident_module call**",
                'yellow'))
        print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
        print("outfile: ", outfile)
        print(
            "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
        )
        print("\t".join([
            "species_identification_KMA.kma_ident_module", outfile,
            str(list_files), name, database,
            str(threads)
        ]) + "\n")

    ## The Sparse mode selection is disabled for the time being:
    #if any(name in basename_tag for name in ['userData_KMA', 'genbank_KMA']):
    #    if (basename_tag == 'userData_KMA'):
    #        option = ''
    #    else:
    #        option = '-Sparse '

    ## the only option used: retrieve database from shared memory
    option = '-shm 1'

    # Call KMA and leave a time stamp behind
    species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                              database, kma_bin, option,
                                              threads)
    HCGB_time.print_time_stamp(stamp_file)
def run_ident(options):
    """
    Main function acting as an entry point to the module *ident*.

    Workflow, as implemented below:

    1. Show a help message and exit if any ``help_*`` option is set.
    2. Set the module globals ``Debug`` and ``Project`` from the options.
    3. Retrieve trimmed samples and create the output folders.
    4. Run the KMA identification (:func:`KMA_ident`).
    5. Unless the only KMA database requested is ``viral``, run the MLST
       identification (the EDirect step is currently disabled, so an empty
       dataframe is handed over instead).
    6. Write one excel/csv summary per sample and a global excel summary.
    7. Unless ``options.fast`` is set, update the NCBI database folder.

    :param options: Parsed command line options. Note that this function
        overwrites ``options.pair`` and ``options.project`` and may set
        ``options.fast``.

    :return: Empty tuple.

    .. seealso:: Additional information to PubMLST available datasets.

        - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
    """
    global Debug, Project

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_KMA):
        ## information for KMA Software
        species_identification_KMA.help_kma_database()
        exit()
    elif (options.help_MLSTar):
        ## information for MLSTar Software
        MLSTar.help_MLSTar()
        exit()

    ## init time
    start_time_total = time.time()

    ## debugging messages
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    ### species_identification_KMA -> most similar taxa
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Species identification")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "ident",
                                            options.debug)

    ## let's start the process
    print(
        "+ Generate an species typification for each sample retrieved using:")
    print("(1) Kmer alignment (KMA) software.")
    print("(2) Pre-defined databases by KMA or user-defined databases.")

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ######## KMA identification
    dataFrame_kma = KMA_ident(options, pd_samples_retrieved, outdir_dict,
                              retrieve_databases, start_time_partial)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve results to summarize **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print("dataframe_kma")
        print(dataFrame_kma)

    ## defaults used when the edirect/MLST steps are skipped
    MLST_results = ''
    dataFrame_edirect = pd.DataFrame()

    ## exit if viral search
    skip = False
    if (len(options.kma_dbs) == 1):
        for i in options.kma_dbs:
            if (i == 'viral'):
                print()
                options.fast = True
                skip = True

            ## what if only plasmids?

    ## do edirect and MLST if bacteria
    if (not skip):
        ######## EDirect identification
        ## currently disabled: an empty dataframe is used instead
        #dataFrame_edirect = edirect_ident(dataFrame_kma, outdir_dict, Debug)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: retrieve results from NCBI **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("dataFrame_edirect")
            print(dataFrame_edirect)

        ######## MLST identification
        MLST_results = MLST_ident(options, dataFrame_kma, outdir_dict,
                                  dataFrame_edirect, retrieve_databases)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(
                colored("**DEBUG: retrieve results to summarize **",
                        'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("MLST_results")
            print(MLST_results)

    ## generate summary for sample: all databases
    ## MLST, plasmids, genome, etc
    HCGB_aes.boxymcboxface("Results Summary")

    #####################################
    ## Summary identification results  ##
    #####################################

    ## parse results
    if options.project:
        final_dir = os.path.join(outdir, 'report', 'ident')
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = outdir

    ###
    excel_folder = HCGB_files.create_subfolder("samples", final_dir)
    print('+ Print summary results in folder: ', final_dir)
    print('+ Print sample results in folder: ', excel_folder)

    ## columns reported for KMA results
    kma_columns = [
        'Sample', '#Template', 'Query_Coverage', 'Template_Coverage', 'Depth',
        'Database'
    ]

    # Group dataframe results summary by sample name.
    # Group by the column name (not a one-item list): recent pandas versions
    # yield tuple keys for list groupers, which breaks the path building below.
    sample_results_summary = dataFrame_kma.groupby("Sample")

    ## debug message
    if (Debug):
        print(colored("**DEBUG: sample_results_summary **", 'yellow'))
        print(sample_results_summary)

    ##
    MLST_all = pd.DataFrame()
    for name, grouped in sample_results_summary:

        ## create a excel and txt for sample
        name_sample_excel = excel_folder + '/' + name + '_ident.xlsx'
        name_sample_csv = outdir_dict[
            name] + '/ident_summary.csv'  ## check in detached mode

        ## subset dataframe
        results_summary_toPrint_sample = grouped[kma_columns]

        ## the context manager saves and closes the excel handle
        ## (ExcelWriter.save() is not available in recent pandas versions)
        with pd.ExcelWriter(name_sample_excel,
                            engine='xlsxwriter') as writer_sample:
            results_summary_toPrint_sample.to_excel(
                writer_sample, sheet_name="KMA")  ## write excel handle
            results_summary_toPrint_sample.to_csv(
                name_sample_csv)  ## write csv for sample

            ## read MLST
            if MLST_results and name in MLST_results:
                sample_MLST = pd.read_csv(MLST_results[name],
                                          header=0,
                                          sep=',')

                ## taxonomy might not be available: edirect results can be
                ## empty or miss this sample. Use 'NaN' in that case.
                taxa = pd.DataFrame()
                if 'sample' in dataFrame_edirect.columns:
                    taxa = dataFrame_edirect.loc[dataFrame_edirect['sample']
                                                 == name]
                if taxa.empty:
                    sample_MLST['genus'] = 'NaN'
                    sample_MLST['species'] = 'NaN'
                else:
                    sample_MLST['genus'] = taxa['genus'].values[0]
                    sample_MLST['species'] = taxa['species'].values[0]

                sample_MLST.to_excel(writer_sample,
                                     sheet_name="MLST")  ## write excel handle

                ## Return information to excel
                MLST_all = pd.concat([MLST_all, sample_MLST])

    ##
    name_excel = final_dir + '/identification_summary.xlsx'
    print('+ Summary information in excel file: ', name_excel)

    ## KMA dataframe: print result for sources
    results_summary_KMA = dataFrame_kma[kma_columns]

    ## Sum plasmid and chromosome statistics ##
    ## sum coverage
    total_coverage = results_summary_KMA.groupby(
        'Sample')['Query_Coverage'].sum().reset_index()

    ## debug message
    if (Debug):
        print("*** Sum: Query_coverage ***")
        print(total_coverage)

    ## TODO: FIX SUMMARY REPORT
    results_summary_KMA = results_summary_KMA.set_index('Sample')
    results_summary_KMA = results_summary_KMA.sort_values(
        by=['Sample', 'Database', 'Query_Coverage'],
        ascending=[True, True, True])

    ## write excel; the handle is saved and closed when leaving the block
    with pd.ExcelWriter(name_excel, engine='xlsxwriter') as writer:
        results_summary_KMA.to_excel(writer, sheet_name='KMA')

        ## write MLST
        if (MLST_results):
            MLST_all.to_excel(writer, sheet_name='MLST')

    print("\n+ Check summary of results in file generated")

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:

        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")

        ## debug message
        if (Debug):
            print(colored("**DEBUG: dataFrame_edirect **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print(dataFrame_edirect)

        ## dataFrame_edirect
        file_toprint = final_dir + '/edirect_info2download.csv'
        dataFrame_edirect.to_csv(file_toprint)

        ## update database with samples identified
        data2download = dataFrame_edirect.filter(
            ['genus', 'species', 'strain', 'genome'])
        data2download = data2download.rename(columns={
            'genome': 'NCBI_assembly_ID',
            'strain': 'name'
        })
        NCBI_folder = os.path.abspath(options.database) + '/NCBI'
        database_generator.NCBI_DB(data2download, NCBI_folder, Debug)

    else:
        print(
            "+ No update of the database has been requested using option --fast"
        )

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting identification module.")
    return ()
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
              time_partial):
    """Kmer identification using software KMA_.

    Every database in ``retrieve_databases`` whose ``source`` starts with
    ``KMA`` is checked for a valid index. For each usable database, the
    database is loaded in memory, one :func:`send_kma_job` is submitted per
    sample through a thread pool and the database is removed from memory
    again. Finally the ``.spa`` result of every sample/database pair is
    parsed and all hits are merged in a single dataframe.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (``threads`` and ``KMA_cutoff`` are read here).
    :param pd_samples_retrieved: pandas dataframe for samples to process. Columns ``name`` and ``sample`` are read here.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param retrieve_databases: pandas dataframe of databases. Columns ``source``, ``db`` and ``path`` are read here.
    :param time_partial: timestamp of start time of the process.

    :type options:
    :type pd_samples_retrieved: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type retrieve_databases: pandas.DataFrame()
    :type time_partial:

    :return: Information of the identification: the parsed KMA hits plus the columns ``Database`` and ``Sample``. See example below.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:

    .. include:: ../../devel/results/KMA_ident_example.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.modules.ident.send_kma_job`

        - :func:`BacterialTyper.modules.ident.get_outfile`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`

    .. include:: ../../links.inc
    """
    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status
    databases2use = []
    for _, db2use in retrieve_databases.iterrows():
        ## only databases flagged as KMA
        if not str(db2use['source']).startswith('KMA'):
            continue

        print('+ Check database: ' + db2use['db'])
        fold_name = os.path.dirname(db2use['path'])
        index_status = species_identification_KMA.check_db_indexed(
            db2use['path'], fold_name)
        if (index_status == True):
            print(
                colored(
                    "\t+ Databases %s seems to be fine...\n\n" % db2use['db'],
                    'green'))
            databases2use.append(db2use['path'])
        else:
            print(
                colored(
                    "\t**Databases %s is not correctly indexed. Not using it...\n"
                    % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
                'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name.
    # Group by the column name (not a one-item list): recent pandas versions
    # yield tuple keys for list groupers, which breaks outdir_dict[name].
    sample_frame = pd_samples_retrieved.groupby("name")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            species_identification_KMA.load_db(kma_bin, db2use)

            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            ## wait for all jobs; report failures but keep going
            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(
                kma_bin, db2use)

            if (return_code_rm == 'FAIL'):
                print(
                    colored(
                        "***ERROR: Removing database from memory failed. Please do it manually! Database: %s"
                        % db2use, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")

    ## frames are collected and concatenated once at the end
    ## (DataFrame.append is not available in recent pandas versions)
    results_frames = []
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided

        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder(
                "kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)

            ## get results using a cutoff value [Default: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            n_hits = results.index.size
            multiple_allowed = basename_db in ("plasmids.T", "viral.TG")

            if n_hits > 1 and not multiple_allowed:
                ### several hits are only expected for plasmids/viral
                print(
                    colored("###########################################",
                            'yellow'))
                print(
                    colored("Sample %s contains multiple strains." % name,
                            'yellow'))
                print(
                    colored("###########################################",
                            'yellow'))
                print(colored(results, 'yellow'))
                print('\n\n')
                ## both strains are added if detected
                ## TODO: add multi-isolate flag

            elif n_hits == 0:
                print(
                    colored(
                        '\tNo clear strain from database %s has been assigned to sample %s'
                        % (basename_db, name), 'yellow'))

            ## every result is reported, whatever the number of hits
            results['Sample'] = name
            results_frames.append(results)

    if results_frames:
        results_summary = pd.concat(results_frames, ignore_index=True)
    else:
        ## no database could be used
        results_summary = pd.DataFrame()

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.

    This module prepares data for later gene duplication analysis.

    It allows the user to provide either a single sample, multiple samples,
    NCBI GenBank IDs or NCBI taxonomy IDs to retrieve and obtain the
    annotation data.

    :param arg_dict: Parsed command line options. ``assembly_level`` and
        ``section`` are filled with defaults when unset and ``project`` is
        overwritten according to ``detached``.

    :return: Empty tuple.
    """
    ## help message
    if arg_dict.input_help:
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for output folder and create it
    outdir = os.path.abspath(arg_dict.output_folder)
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    arg_dict.assembly_level = arg_dict.assembly_level or 'complete'
    arg_dict.section = arg_dict.section or 'genbank'

    ## project or detached?
    if not arg_dict.detached:
        arg_dict.project = True
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = HCGB_files.create_subfolder("info", outdir)
    else:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir

    ## debug messages
    if arg_dict.debug:
        separator = '+++++++++++++++++++++++++++++++'
        debug_message(separator)
        for text in ('Project/Detached option:',
                     'arg_dict.detached: ' + str(arg_dict.detached),
                     'arg_dict.project: ' + str(arg_dict.project),
                     'outdir:' + outdir,
                     'final_dir:' + final_dir):
            debug_message(text, 'yellow')
        debug_message(separator)

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    input_menu = (
        '+ Getting input information provided... ',
        '+ Several options available:',
        '\t* Single/Multiple Annotation file:',
        '\t |-- GenBank format files',
        '\t |-- GFF files + Reference fasta files required',
        '\n\t* Single/Multiple NCBI GenBank IDs',
        '\n\t* Single/Multiple NCBI taxonomy IDs + Options',
        '\n\t* A previous BacDup project folder',
        '\n+ Check the option provided...',
    )
    for menu_line in input_menu:
        print(menu_line)
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'
    df_accID = parse_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    report_folder = HCGB_files.create_subfolder("report", outdir)
    input_folder = HCGB_files.create_subfolder("input", report_folder)

    ## add df_accID.loc[sample,] information as csv into input folder
    info_csv = os.path.join(input_folder, 'info.csv')
    df_accID.to_csv(info_csv, index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
def get_assembly_stats_all(assembly_stats_dict, outdir, debug):
    """Merge the assembly statistics of all samples in a single excel file.

    Folders ``report/assembly_stats`` and ``report/assembly_stats/samples``
    are created under ``outdir``. The excel file of each sample is copied to
    the ``samples`` folder and the contig and scaffold statistics of every
    sample are written to ``summary_stats.xlsx`` (sheet ``all_data``).

    :param assembly_stats_dict: Dictionary indexed by sample name. Each value
        is indexable: position 0 holds a dictionary with the entries
        ``'Contig Stats'`` and ``'Scaffold Stats'`` (dictionaries themselves)
        and position 1 the path to the excel file of the sample.
    :param outdir: Folder where the ``report`` folder is created.
    :param debug: If true, print debugging messages.

    :type assembly_stats_dict: Dictionary
    :type outdir: string
    """
    ## get all assembly stats
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    final_dir = HCGB_files.create_subfolder("assembly_stats", outdir_report)
    final_sub_dir = HCGB_files.create_subfolder("samples", final_dir)

    #### summary and information
    ## NOTE(review): the relabelling below assumes each stats dictionary
    ## provides exactly the 17 statistics named here, in this order -- confirm
    column_names = ("Type", "Sample", "Total Sequences", "GC% Content",
                    "Longest sequence", "Shortest sequence", "Median length",
                    "Mean length", "Total Length (bp)", "L10", "N10", "L20",
                    "N20", "L30", "N30", "L40", "N40", "L50", "N50")

    ## debugging messages
    if debug:
        HCGB_aes.debug_message("Create assembly statistic for all samples")

    stats_frames = []
    for sample_name, sample_stats in assembly_stats_dict.items():
        stats_dict = sample_stats[0]
        excel_file_stats = sample_stats[1]

        if debug:
            HCGB_aes.debug_message("sample_name: " + sample_name, 'yellow')
            HCGB_aes.debug_message("excel: " + excel_file_stats, 'yellow')
            HCGB_aes.debug_message("contig stats dictionary: ", 'yellow')
            print(stats_dict['Contig Stats'])
            HCGB_aes.debug_message("scaffold stats dictionary: ", 'yellow')
            print(stats_dict['Scaffold Stats'])

        ## one single-row dataframe for contigs and another for scaffolds
        for stats_key, stats_type in (('Contig Stats', 'contigs'),
                                      ('Scaffold Stats', 'scaffolds')):
            stats_frame = pd.DataFrame.from_dict(stats_dict[stats_key],
                                                 orient='index').transpose()
            stats_frame['type'] = stats_type
            stats_frame['sample_name'] = sample_name
            stats_frames.append(stats_frame)

        ## copy individual excel file
        shutil.copy(excel_file_stats, final_sub_dir)

    ## nothing to summarize
    if not stats_frames:
        print("+ No assembly statistics available to summarize.")
        return

    ## add all data
    results_summary_toPrint_all = pd.concat(stats_frames, ignore_index=True)

    ## reorder columns: type and sample name first, then the statistics
    leading_cols = ['type', 'sample_name']
    stats_cols = [
        col for col in results_summary_toPrint_all.columns
        if col not in leading_cols
    ]
    results_summary_toPrint_all = results_summary_toPrint_all[leading_cols +
                                                              stats_cols]

    ## rename columns (axis as keyword: positional use is not supported
    ## in recent pandas versions)
    results_summary_toPrint_all = results_summary_toPrint_all.set_axis(
        list(column_names), axis=1)

    ## write to excel; the handle is saved and closed when leaving the block
    name_excel_summary = final_dir + '/summary_stats.xlsx'
    with pd.ExcelWriter(name_excel_summary,
                        engine='xlsxwriter') as writer_summary:
        results_summary_toPrint_all.to_excel(writer_summary,
                                             sheet_name="all_data")
def run_biotype(options):
    """Entry point of the RNA biotype analysis module of XICRA.

    Workflow, as implemented below:

    1. Show a help message and exit if any ``help_*`` option is set.
    2. Retrieve the samples (raw fastq if ``options.noTrim``, trimmed reads
       otherwise), discarding joined reads.
    3. Map reads (``mapReads_module``).
    4. Get the RNA biotype information (``RNAbiotype.RNAbiotype_module_call``).
    5. Unless ``options.skip_report`` is set, create a MultiQC report for
       featureCount, a merged csv table and a summary plot using R.

    :param options: Parsed command line options. Note that this function
        overwrites ``options.pair`` and ``options.project``.

    :return: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## exit as any other help option: do not start the analysis
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNA biotype
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq",
            ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    # NOTE(review): ``mapping_results`` is never assigned inside this
    # function. Unless it is defined at module level, the debug print below
    # and the RNAbiotype call raise NameError. Presumably mapReads_module()
    # was meant to return it alongside the timestamp -- confirm against its
    # definition before changing this assignment.
    start_time_partial = mapReads_module(options, pd_samples_retrieved,
                                         mapping_outdir_dict, options.debug,
                                         max_workers_int, threads_job,
                                         start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## unique folders to hand over to multiQC report module
        my_outdir_list = set(biotype_outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}

        for samples, sample_folder in biotype_outdir_dict.items():
            featurecount_file = os.path.join(sample_folder,
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf, if any was generated for this sample
            pdf_plot = main_functions.retrieve_matching_files(
                sample_folder, '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting biotype module.")
    return ()
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval

    This functions uses the software edirect_ to connect to NCBI and retrieve some information regarding samples, assemblies, publications, etc.

    For each sample, the first ``bacteria.ATG`` hit (chromosome match) is
    used to query NCBI for organism, BioSample, assembly accession and
    strain, and the assembly accession is translated into a GenBank ID.
    Hits from ``plasmids.T`` are reported as a comma separated list of
    accession IDs. Samples with no chromosome match are reported with empty
    taxonomy fields; samples whose NCBI retrieval fails are not reported.

    :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`. Columns ``Sample``, ``Database`` and ``#Template`` are read here.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param Debug: If true, print debugging messages.

    :type dataFrame: pandas.DataFrame()
    :type outdir_dict: Dictionary

    :return: Information of the identification, with columns ``sample``, ``genus``, ``species``, ``strain``, ``BioSample``, ``genome`` and ``Plasmids``.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:

    .. include:: ../../devel/results/edirect_download_results.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.get_info_file`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.functions.optimize_threads`

        - :func:`BacterialTyper.scripts.functions.create_subfolder`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.scripts.functions.is_non_zero_file`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`

    .. include:: ../../links.inc
    """
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species",
                                          "strain", "BioSample", "genome",
                                          "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe sample name.
    # Group by the column name (not a one-item list): recent pandas versions
    # yield tuple keys for list groupers, which breaks outdir_dict[name].
    sample_results = dataFrame.groupby("Sample")

    for name, grouped in sample_results:
        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect',
                                                     outdir_dict[name])
        filename_stamp = edirect_folder + '/.success_species'

        ## reset every field for this sample: values must never leak
        ## from the previous sample
        genus = ''
        species = ''
        strain = ''
        BioSample_name = ''
        genbank_id = ''

        ## chromosome match
        chromosome_hits = grouped.loc[grouped['Database'] ==
                                      'bacteria.ATG']['#Template']
        chromosome_found = len(chromosome_hits) > 0

        if not chromosome_found:
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")

        else:
            nucc_entry = chromosome_hits.values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            ##
            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]" %
                        (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0],
                                                    out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain',
                    tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            ## first line: Organism,BioSample,AssemblyAcc[,Strain]
            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
            taxa_fields = taxa_name_tmp[0].split(',')
            Organism = taxa_fields[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = taxa_fields[1]  ## BioSample
            AssemblyAcc = taxa_fields[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(taxa_fields) > 3:
                strain = taxa_fields[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc,
                                                out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly,
                                                'DocumentSummary', 'Genbank',
                                                AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank

            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            genbank_id = GenbankAcc[0]
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", genbank_id)

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join(
            [i.split()[0] for i in plasmid_entries])

        ## save edirect_frame
        #("sample", "genus", "species", "strain", "BioSample", "genome", "Plasmids")
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, genbank_id,
                                                 plasmid_entries_str)

        ## flag success only if the chromosome information was retrieved:
        ## the stamp makes later runs read info.csv from this folder
        if chromosome_found:
            HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
def run(options):
    """Trim adapters from the reads of every sample and report the results.

    Steps performed, in order:

    - Print one of the help messages and exit if a ``help_*`` option is set.
    - Set the module-level ``Debug`` flag and ``options.pair`` /
      ``options.project`` from the command-line options.
    - Retrieve the FASTQ files found in ``options.input`` and create the
      output folder of each sample.
    - Submit one ``trimmo_caller`` job per sample to a thread pool.
    - In detached mode, gather symbolic links to the trimmed reads under
      ``<outdir>/link_files``.
    - Unless ``options.skip_report`` is set, build a MultiQC report and run
      FastQC on the trimmed reads.

    :param options: parsed command-line options. Attributes read here:
        ``help_format``, ``help_trimm_adapters``, ``help_project``,
        ``help_multiqc``, ``debug``, ``single_end``, ``detached``, ``input``,
        ``output_folder``, ``threads``, ``adapters`` and ``skip_report``.
    :return: empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq",
        ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name.
    # Group by the plain column label (not a one-element list): recent pandas
    # versions yield 1-tuples as keys for list groupers, which would not match
    # the string keys of outdir_dict.
    sample_frame = pd_samples_retrieved.groupby("name")

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list("available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                ## result() re-raises any exception raised inside the job
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: generate symbolic links for each file in " + dir_symlinks + "**", 'yellow'))

        for fold in folders:
            this_folder = outdir + '/' + fold

            ## Only sample folders can hold trimmed reads: skip log files, any
            ## other regular file (os.listdir would fail on it) and the folder
            ## of symbolic links itself (avoids linking links on a re-run).
            if (fold.endswith(".log") or fold == 'link_files'
                    or not os.path.isdir(this_folder)):
                continue

            for files in os.listdir(this_folder):
                ## only paired-end. Todo: single end
                if re.search(r".*trim_R\d{1}.*", files):
                    files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report", "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic", trimm_report, "")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % trimm_report)

        ## create fastqc for trimmed reads
        pd_samples_retrieved_trimmed = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
                  start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("\n+ Exiting trimm module.")
    return ()
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases, start_time_partial):
    """Run ARIBA for every sample against every indexed ARIBA database.

    Steps performed, in order:

    - Keep the databases in ``retrieve_databases`` whose ``source`` is
      ``'ARIBA'`` and that ``ariba_caller.check_db_indexed`` reports as indexed.
    - Build a (sample, db) indexed table with the output folder of each run.
    - For each database, submit one ``ariba_run_caller`` job per sample to a
      thread pool and collect results with
      ``virulence_resistance.check_results``.
    - Write per-database summaries and the ``profile_summary.xlsx`` workbook.

    :param options: parsed command-line options. Attributes read here:
        ``database``, ``ARIBA_cutoff`` (defaults to 0.90 when unset),
        ``threads``, ``input`` and ``output_folder``.
    :param pd_samples_retrieved: pandas dataframe of input files; columns
        ``name`` and ``sample`` are read.
    :param outdir_dict: dictionary mapping sample name to its output folder.
    :param retrieve_databases: pandas dataframe of databases; columns
        ``source``, ``path`` and ``db`` are read.
    :param start_time_partial: time reference passed to
        ``HCGB_time.timestamp`` to report elapsed time.
    :return: None.

    The module-level ``Debug`` and ``Project`` flags are read.
    """
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status ##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(colored("**DEBUG: card_trick_info: " + card_trick_info + " **", 'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db', 'output'))

    # Group dataframe by sample name.
    # Group by the plain column label (not a one-element list): recent pandas
    # versions yield 1-tuples as keys for list groupers, which would break the
    # outdir_dict[name] and (name, db) lookups below.
    sample_frame = pd_samples_retrieved.groupby("name")

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name], db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))

            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0], db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job, options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    ## result() re-raises any exception raised inside the job
                    cmd2.result()
                except Exception as exc:
                    ## best effort: report the failing sample and keep going
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print("+ Collecting information for each sample analyzed for database: " + db2use[1])

            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff, card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print("\n + Generate a summary file for all samples and one for each database employed...")

    ## parse results
    if Project:
        ## input_dir is not a parameter of this function: derive it from the
        ## options here, the same way MLST_ident does.
        input_dir = os.path.abspath(options.input)
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir)
    ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer; the context manager writes and closes the workbook
    ## (ExcelWriter.save() is not available in recent pandas versions).
    name_excel = final_dir + '/profile_summary.xlsx'
    with pd.ExcelWriter(name_excel, engine='xlsxwriter') as writer:

        for database, data in outdir_samples.groupby(level='db'):  ## fix
            report_files_databases = {}

            for sample, data2 in data.groupby(level='sample'):  ## fix
                file_report = data2.loc[sample, database]['output'] + '/report.tsv'
                if os.path.isfile(file_report):  ## check if exists
                    report_files_databases[sample] = file_report

            ## NOTE(review): `database` is a value of the 'db' index level
            ## (the db name stored above), yet it is matched against
            ## '*_prepareref/' path suffixes; confirm these branches can match.
            outfile_summary = subfolder + "/"
            if database.endswith('card_prepareref/'):
                outfile_summary = outfile_summary + 'CARD_summary'
                name_db = 'CARD'
            elif database.endswith('vfdb_full_prepareref/'):
                outfile_summary = outfile_summary + 'VFDB_summary'
                name_db = 'VFDB'
                vfdb = True
            else:
                ## TODO: check if there are multiple 'other' databases
                ## Different databases provided (different to VFDB and CARD) would collapse file
                outfile_summary = outfile_summary + 'Other_summary'
                name_db = 'other'

            ## call ariba summary to summarize results
            csv_all = ariba_caller.ariba_summary_all(outfile_summary, report_files_databases)
            if not csv_all == 'NaN':
                csv2excel = pd.read_csv(csv_all, header=0, sep=',')
                ## write excel
                name_tab = name_db + '_found'
                csv2excel.to_excel(writer, sheet_name=name_tab)

        ## results_df contains excel and csv files for each sample and for each database.
        ## It has no 'database' column when no database was analyzed.
        if 'database' in results_df.columns:
            list_databases = set(results_df['database'].to_list())
        else:
            list_databases = set()

        for db in list_databases:
            df_db = results_df[results_df['database'] == db]['csv']
            dict_samples = df_db.to_dict()

            merge_df = pd.DataFrame()
            for sample in dict_samples:
                if os.path.isfile(dict_samples[sample]):
                    df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                    df = df.set_index('Genes')
                    ## keep only the status column, labelled with the sample name
                    df.rename(columns={'Status': sample}, inplace=True)
                    df2 = df[[sample]]

                    ## add to a common dataframe
                    merge_df = pd.concat([merge_df, df2], axis=1, sort=True)

            merge_df.fillna("NaN", inplace=True)
            trans_df = merge_df.transpose()

            ## write excel
            name_tab = db + '_all'
            trans_df.to_excel(writer, sheet_name=name_tab)

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir + '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ", final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print("+ For each database upload files *phandango.csv and *phandango.tre and visualize results")
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect, retrieve_databases): """Generate MLST profile identification This functions uses the `MLSTar software`_ to retrieve Multi locus sequence typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates MLST profiling for each sample. :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in... :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`. :param outdir_dict: dictionary containing information for each sample of the output folder for this process. :param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`. :param retrieve_databases: :type options: :type dataFrame: pandas.DataFrame() :type outdir_dict: Dictionary :type dataFrame_edirect: pandas.DataFrame() :type retrieve_databases: pandas.DataFrame() :return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information. :rtype: Dictionary See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here: .. include:: ../../devel/results/doMLST_result_example.csv :literal: .. seealso:: Additional information to PubMLST available datasets. - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>` .. seealso:: This function depends on other ``BacterialTyper`` functions called: - :func:`BacterialTyper.scripts.functions.read_time_stamp` - :func:`BacterialTyper.scripts.functions.create_subfolder` - :func:`BacterialTyper.scripts.functions.boxymcboxface` - :func:`BacterialTyper.scripts.MLSTar.run_MLSTar` - :func:`HCGB.sampleParser.files.get_files` - :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species` .. 
include:: ../../links.inc """ ## set config rscript = set_config.get_exe("Rscript") ## TODO: Samples might not be assembled...to take into account and return 0 ## TODO: Fix and install MLSTar during installation print(MLSTar.get_MLSTar_package_installed()) exit() ######################################################################################## ## TODO: What to do if multi-isolate sample? ## TODO: Control if a different profile is provided via --MLST_profile ## TODO: Check time passed and download again if >?? days passed] ## debug message if (Debug): print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow')) print(dataFrame_edirect) ## MLST call HCGB_aes.boxymcboxface("MLST typing") print( "+ Create classical MLST typification of each sample according to species retrieved by kmer..." ) ## get assembly files input_dir = os.path.abspath(options.input) assembly_samples_retrieved = sampleParser.files.get_files( options, input_dir, "assembly", ["fna"], options.debug) ## debug message if (Debug): print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow')) print(assembly_samples_retrieved) # init MLST_results = {} ## get MLST_profile: default or provided mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] == 'PubMLST']['path'].tolist() if (Debug): print("** Debug **") print("mlst_profile_list") print(mlst_profile_list) print("dataFrame_edirect") print(dataFrame_edirect) ## Generate MLST call according to species identified for each sample for index, row in dataFrame_edirect.iterrows(): MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'], row['species']) if (MLSTar_taxa_name == 'NaN'): print( colored( "\t- Not available PubMLST profile for sample [%s] identified as %s %s" % (row['sample'], row['genus'], row['species']), 'yellow')) else: for mlst_profile in mlst_profile_list: ## species folder #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder) species_mlst = mlst_profile.split(',')[0] 
species_mlst_folder = mlst_profile.split(',')[1] ## output file output_file = species_mlst_folder + '/PubMLST_available_scheme.csv' filename_stamp = species_mlst_folder + '/.success_scheme' ## if MLSTar_taxa_name == species_mlst: if os.path.isfile(filename_stamp): stamp = HCGB_time.read_time_stamp(filename_stamp) print( colored( "\tA previous command generated results on: %s" % stamp, 'yellow')) else: ### get scheme available MLSTar.getPUBMLST(MLSTar_taxa_name, rscript, output_file) stamp = HCGB_time.print_time_stamp(filename_stamp) ## parse and get scheme for classical MLST schemes_MLST = pd.read_csv(output_file, sep=',', header=0) ## for item, cluster in schemes_MLST.iterrows(): if cluster['len'] < 10: scheme2use = int(cluster['scheme']) continue ### sample = row['sample'] MLSTar_folder = HCGB_files.create_subfolder( 'MLST', outdir_dict[sample]) genome_file = assembly_samples_retrieved.loc[ assembly_samples_retrieved['name'] == sample]['sample'].values[0] ## call MLST (results, profile_folder) = MLSTar.run_MLSTar( species_mlst_folder, rscript, MLSTar_taxa_name, scheme2use, sample, MLSTar_folder, genome_file, options.threads) MLST_results[sample] = results ## print("+ Finish this step...") return (MLST_results)
def run_cluster(options):
    """Cluster project samples and database entries using min-hash sketches.

    Steps performed, in order:

    - Print one of the help messages and exit if a ``help_*`` option is set.
    - Set the module-level ``Debug`` flag and ``options.pair`` /
      ``options.project`` from the command-line options.
    - Retrieve the input files (raw reads, trimmed reads or assemblies) and
      create the output folder of each sample.
    - Read, or generate with ``generate_sketch``, one signature per database
      entry and per project sample.
    - Compare all signatures, plot the result and write a Newick tree, all
      through ``min_hash_caller``.

    :param options: parsed command-line options. Attributes read here:
        ``help_project``, ``help_Mash``, ``debug``, ``single_end``,
        ``detached``, ``input``, ``output_folder``, ``reads``, ``noTrim``,
        ``ex_sample``, ``kmer_size`` and ``n_sketch``.
    :return: empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "fastq",
                ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        else:
            ## trimm reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "trim", ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]
    else:
        ## default
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    # exit if empty
    if pd_samples_retrieved.empty:
        print("No data has been retrieved from the project folder provided. Exiting now...")
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "mash",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[
            ~retrieve_databases.index.isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            ## reuse the stored signature when it was built with the
            ## requested k-mer size and number of sketches
            if all([
                    int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])
            ]):
                siglist_all.append(
                    min_hash_caller.read_signature(row['path'], options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)

        ## Update the table with a single .loc[row, column] assignment:
        ## chained indexing (.loc[index]['path'] = ...) writes to a temporary
        ## object and leaves the dataframe unchanged.
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)
    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(colored('\t+ Sketched signature (%s) available within user data...' % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## File signature might exist
            ## read original
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)

            ## Reuse the signature only if its recorded parameters match the
            ## requested ones; a missing or incomplete '.original' file means
            ## the parameters are unknown, so the sketch is generated again.
            if (len(original) > 2
                    and int(options.kmer_size) == int(original[1])
                    and int(options.n_sketch) == int(original[2])):
                siglist_all.append(
                    min_hash_caller.read_signature(this_sig, options.kmer_size))
                pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                    'project_data', index, this_sig, row['sample'],
                    options.kmer_size, options.n_sketch)
                print(colored('\t+ Sketched signature available (%s) in project folder...' % index, 'green'))
                continue

        print(colored('\t+ Sketched signature to be generated: (%s)...' % index, 'yellow'))

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = (
            'project_data', index, sigfile, row['sample'],
            options.kmer_size, options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[[
            'source', 'db', 'path', 'original', 'ksize', 'num_sketch'
        ]]
        tmp = tmp.rename(columns={'db': 'name'})
        ## NOTE(review): set_index returns a new dataframe and the result is
        ## discarded here, so tmp keeps the index of retrieve_databases.
        ## Confirm which index is intended before assigning the result.
        tmp.set_index('name')

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp], join='inner', sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)

        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)

        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)

        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign Colors colorLabels
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data), "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data), "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        #final_dir = outdir + '/report/cluster'
        ## same helper module as every other create_subfolder call in this file
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all, tag_cluster_info, Debug)

    ## get colorLabels

    ## plot images
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext,
                                            tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext,
                                    tag_cluster_info)

    return ()
def main():
    """Command-line entry point: assemble a sample and print contig statistics.

    Expects six positional command-line arguments, in this order:

    1. first reads file
    2. second reads file
    3. sample name
    4. SPAdes binary
    5. number of threads (integer)
    6. output path; a subfolder named after the sample is created inside it

    When fewer arguments are given, ``help_options()`` is called and the
    program exits. Otherwise the main and plasmid assemblies are run, plasmid
    sequences are discarded from the main assembly, sequences are renamed and
    the statistics files produced by ``contig_stats`` are printed.
    """
    ## control if options provided or help: all six arguments are required
    if len(sys.argv) < 7:
        help_options()
        exit()
    print("")

    file1 = os.path.abspath(sys.argv[1])
    file2 = os.path.abspath(sys.argv[2])
    sample = sys.argv[3]
    SPADES_bin = sys.argv[4]
    threads = int(sys.argv[5])
    path = sys.argv[6]

    folder = HCGB_files.create_subfolder(sample, path)

    ## assembly main
    path_to_contigs = run_SPADES_assembly(folder, file1, file2, sample,
                                          SPADES_bin, threads, debug=True)

    ## assembly plasmids
    path_to_plasmids = run_SPADES_plasmid_assembly(folder, file1, file2,
                                                   sample, SPADES_bin, threads)

    ## discard plasmids from main
    tmp_contigs, tmp_plasmids = discardPlasmids(path_to_contigs,
                                                path_to_plasmids, folder,
                                                sample)

    ## rename fasta sequences: final name is the temporary name without ".tmp"
    new_contigs = tmp_contigs.split(".tmp")[0]
    rename_contigs(tmp_contigs, "scaffolds_chr", new_contigs)

    ## stays empty when no plasmid file was produced
    new_plasmids = ""
    if os.path.isfile(tmp_plasmids):
        new_plasmids = tmp_plasmids.split(".tmp")[0]
        rename_contigs(tmp_plasmids, "scaffolds_plasmids", new_plasmids)

    ## generate contig statistics
    print('+ Get assembly statistics:...\n')

    ## get contig statistics
    contig_out = contig_stats(new_contigs, True)

    ## dump in screen
    with open(contig_out, 'r') as contig_out_file:
        print(contig_out_file.read())
    print()

    ## no plasmid file to summarize: either none was produced (empty name)
    ## or the name is the 'FAIL' marker
    if not new_plasmids or new_plasmids == 'FAIL':
        print('+ No plasmids identified...\n')
    else:
        print('+ Plasmids assembly')
        plasmid_out = contig_stats(new_plasmids, True)

        ## dump in screen
        with open(plasmid_out, 'r') as plasmid_out_file:
            print(plasmid_out_file.read())