def help_ARIBA():
    """Print a description and citation for each ARIBA database supported.

    Reads the citation dictionary from :func:`citation.ariba_citation` and
    prints, for every database, a separator line, a header, a short
    description and the citation entry. Output goes to stdout only.
    """
    dict_ariba = citation.ariba_citation()

    ## TODO: finish filling information for different databases
    ## (entries with "..." below still need a real description)
    ## One tuple per database: (header text, description, citation key).
    ## Refactored from seven copy-pasted print stanzas into a single loop;
    ## the printed output is unchanged (including the trailing '\n' some
    ## headers carry).
    db_info = [
        ("CARD:",
         "The Comprehensive Antibiotic Resistance Database (CARD) is a rigorously curated collection of characterized, peer-reviewed resistance determinants and associated antibiotics, organized by the Antibiotic Resistance Ontology (ARO) and AMR gene detection models.",
         'CARD'),
        ("VFDB:",
         "The virulence factor database (VFDB) is an integrated and comprehensive online resource for curating information about virulence factors of bacterial pathogens. Since its inception in 2004, VFDB has been dedicated to providing up-to-date knowledge of VFs from various medically significant bacterial pathogens.",
         'VFDB'),
        ("ARG-ANNOT:\n", "...", 'ARG-ANNOT'),
        ("MEGARes:",
         "The MEGARes database contains sequence data for approximately 4,000 hand-curated antimicrobial resistance genes accompanied by an annotation structure that is optimized for use with high throughput sequencing.",
         'MEGARes'),
        ("PlasmidFinder:\n", "...", 'PlasmidFinder'),
        ("ResFinder:\n",
         "The ResFinder database is a curated database of acquired resistance genes.",
         'ResFinder'),
        ("srst2:", "...", 'srst2'),
    ]

    for header, description, key in db_info:
        print("")
        HCGB_aes.print_sepLine("*", 50, False)
        print(header)
        print(description)
        print('Citation:', dict_ariba[key])

    ## closing separator
    print("")
    HCGB_aes.print_sepLine("*", 50, False)
    print("")
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.

    :param source: Type of database to search: ARIBA, KMA, NCBI, MLST, user_data
    :param database_folder: Absolute path to database folder.
    :param option: String containing multiple entries separated by '#' that indicate
        the type of database entries to search within each source type.
    :param debug: True/False for debugging messages.

    :type source: string
    :type database_folder: string
    :type option: string
    :type debug: bool

    :returns: Dataframe containing absolute paths to the available databases for
        each type requested. It contains columns for: "source", "db", "path"

    e.g.: source = KMA
    option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank

    e.g.: source = NCBI
    option = genbank
    """
    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database; bail out early with an empty frame if the
    ## database folder does not exist
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## parse the '#'-separated user option string into the list of dbs to use
    dbs2use = []
    option_list = option.split("#")
    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")
                ## external KMA files are registered directly in the dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_External', name_ext, ext]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        elif (option_item.startswith('ARIBA:')):
            ## note: replaces (not extends) any dbs collected so far
            dbs2use = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")
                ## external Mash files are registered directly in the dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = ['Mash_external', name_ext_, ext]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(option_item)  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names
        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = ['ARIBA', ariba_db, this_db]
                    print(colored("\t- ARIBA: including information from database: " + ariba_db, 'green'))
            else:
                ## missing database: download it now and register it if it worked
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug, 2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = ['ARIBA', ariba_db, this_db]
                    print(colored("\t- ARIBA: including information from database: " + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA databases exists
                this_db_file = this_db + '/genbank_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(colored("\t- genbank: including information from different reference strains available.", 'green'))
                    ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_genbank', 'genbank', this_db_file]

            #### user_data
            elif (db == "user_data"):
                ## KMA databases exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(colored("\t- user_data: including information from user previously generated results", 'green'))
                    ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_user_data', 'user_data', this_db_file]

            ## default KMA databases: bacteria & plasmids
            else:
                ## KMA index file suffix depends on the database type
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)

                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_db', db, this_db_file]
                    print(colored("\t- KMA: including information from database " + db, 'green'))
                else:
                    print(colored("\t**KMA: Database %s was not available." % db, 'red'))
                    ## if missing: call download module and re-check afterwards
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = ['KMA_db', db, this_db_file]
                        print(colored("\t- KMA: including information from database " + db, 'green'))
                    else:
                        print(colored("\t**KMA: Database %s was not available." % db, 'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):
        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = ['NCBI:genbank', entry, this_db]

        elif dbs2use[0] == 'tax_id':
            ## NOTE(review): assigned but never used in the visible code — confirm
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)
        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder('PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)
                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = ['MLST', 'PubMLST', entry + ',' + this_db]
                    print(colored("\t- MLST: including information from profile: " + entry, 'green'))
            else:
                db_Dataframe.loc[len(db_Dataframe)] = ['MLST', 'user_profile', db]
                print(colored("\t- MLST: including information from profile provided by user: " + db, 'green'))

    ##############
    #### Mash ####
    ##############
    ## NOTE(review): the original text between the MLST user-profile print and the
    ## "genbank" check below was corrupted/redacted in the source ("******").
    ## This branch header and loop opening are reconstructed from the surrounding
    ## code — verify against the upstream project before relying on it.
    elif (source == 'MASH' or source == 'Mash'):
        ## Mash entries below carry extra sketch metadata (7 values per row), so
        ## the frame is widened; existing Mash_external rows are kept.
        ## NOTE(review): column names after "path" are reconstructed — confirm.
        db_Dataframe = db_Dataframe.reindex(
            columns=["source", "db", "path", "original", "ksize", "num_sketch", "folder"])

        ### get information
        for db in dbs2use:
            #### genbank
            if (db == "genbank"):
                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(colored("\n\t- genbank: including information from different reference strains available.", 'green'))
                    ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')
                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])
                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN']
                            else:
                                original = HCGB_main.readList_fromFile(file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0],
                                original[1], original[2], this_db]
                        else:
                            ## index assembly or reads... not available
                            list_fna = HCGB_main.retrieve_matching_files(this_db, 'genomic.fna', debug)
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db]

            #### user_data
            elif (db == "user_data"):
                print(colored("\n\t- user_data: including information from user previously generated results", 'green'))
                ## include user data
                db2use_abs = HCGB_files.create_subfolder('user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    if entry == 'user_database.csv':
                        continue
                    print('\t+ Reading information from sample: ', entry)
                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0],
                            original[1], original[2], this_db + '/mash']
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0],
                            'NaN', 'NaN', this_db + '/mash']

        #### external_data
        ### TODO: Fix this
        mash_bin = ""  # set_config.get_exe('mash')
        ## BUGFIX: original test was `any(name in 'Mash_external' for name in ...)`,
        ## a substring check in the wrong direction; replaced by a membership test.
        if 'Mash_external' in db_Dataframe['source'].to_list():
            print(colored("\t- external_data: including information from external data provided by user", 'green'))
            ## include user data
            db_Dataframe = db_Dataframe.set_index("db", drop=False)
            frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
            for index, row in frame.iterrows():
                print('\t+ Reading information for file: ', row['db'])
                outfile = row['path'] + '.msh'
                if not os.path.exists(outfile):
                    path_file = os.path.dirname(row['path'])
                    this_db_file = min_hash_caller.sketch_database(
                        [row['path']], mash_bin, row['path'], row['db'], path_file)
                    HCGB_aes.print_sepLine("*", 50, False)
                ## NOTE(review): original row had only 4 values, which mismatches
                ## the widened frame; padded with placeholders — verify metadata.
                db_Dataframe.loc[row['db']] = [
                    'Mash_external', row['db'], outfile, row['path'],
                    'NaN', 'NaN', os.path.dirname(row['path'])]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
def run_search(arg_dict):
    """Main function of the search module in BacDup package.

    This module searches and create gene duplication analysis. It allows the user
    to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy
    or user annotation data) or a single or multiple samples.

    :param arg_dict: Parsed command-line arguments for the search module.
    :returns: Empty tuple on completion; results are written to disk under
        the project/output folder.
    """
    ## help messages: print the requested help text and stop
    if (arg_dict.input_help):
        help_input()
        exit()
    if (arg_dict.blast_help):
        info.blast_help()
        exit()
    if (arg_dict.project_help):
        info.project_help()
        exit()
    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t |-- GenBank format files')
    print('\t |-- GFF files + Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')
    print("""\n\n**** NOTE: **** For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs) use the input module to accommodate accordingly """)
    time.sleep(1)
    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)
    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)
    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())

    for sample, folder in dict_search_folders.items():
        annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):
            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            ## renamed from `format` to avoid shadowing the builtin
            file_format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, file_format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                # save results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample], 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)
                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored("\t+ Filter results already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ## save per-sample duplicate stats
            info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)
        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored("\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time), 'green'))

        ## add info for each
        dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
        annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
        data2add_entry = dup_searcher.get_dup_stats(
            sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data
        #data2add_entry = data2add_entry.reset_index()
        ## BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in
        ## 2.0; pd.concat is the supported equivalent (same ignore_index=False).
        data2add = pd.concat([data2add, data2add_entry], ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True, header=True)

    ## maybe add a summary of the files?
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting search module.")
    return ()
def download_ariba_databases(list_dbs, main_folder, Debug, threads):
    """Download and prepare ARIBA_ databases.

    Using ARIBA software this function retrieves desired databases and prepares
    them for later analysis. Databases already downloaded are kept unless the
    time stamp is older than 30 days, in which case they are refreshed.

    :param list_dbs: List of databases to download.
    :param main_folder: Absolute path to database folder.
    :param Debug: True/false for printing developer messages
    :param threads: Number of CPUs to use.

    :type list_dbs: string
    :type main_folder: string
    :type Debug: Boolean
    :type threads: integer

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.file_functions.create_subfolder`
        - :func:`HCGB.functions.time_functions.read_time_stamp`
        - :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`
        - :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`

    .. include:: ../../links.inc
    """
    print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
    ariba_folder = HCGB_files.create_subfolder("ARIBA", main_folder)

    ## show which databases will be processed
    print("+ Available databases:")
    for database_name in get_ARIBA_dbs(list_dbs):
        HCGB_aes.print_sepLine("-", 30, False)
        print(colored("+ " + database_name, 'yellow'))

        ## per-database folders: raw download plus the '_prepareref' indexed copy
        download_folder = HCGB_files.create_subfolder(database_name, ariba_folder)
        prepareref_folder = download_folder + '_prepareref'

        ## time stamp marking a previously successful prepareref run
        success_stamp = prepareref_folder + '/.success'

        if not os.path.isfile(success_stamp):
            ## nothing cached: download and prepare the reference now
            getref_status = ariba_getref(database_name, download_folder, Debug, threads)
        else:
            ## cached copy available: report it and check how old it is
            stamp = HCGB_time.read_time_stamp(success_stamp)
            print("\t+ Database is downloaded in folder: ", download_folder)
            print("\t+ Data is available and indexed in folder: ", prepareref_folder)
            print(colored("\tDatabase was previously downloaded and prepared on: %s" % stamp, 'yellow'))

            ## Check if necessary to download again after several months/days
            days_passed = HCGB_time.get_diff_time(success_stamp)
            print("\t\t** %s days ago" % days_passed)
            if days_passed > 30:
                ## considered stale after a month: refresh the download
                print("\t\t** Downloading information again just to be sure...")
                getref_status = ariba_getref(database_name, download_folder, Debug, threads)
            else:
                getref_status = 'OK'

        if getref_status == 'OK':
            print()
        else:
            print(colored("** ARIBA getref failed or generated a warning for " + database_name, 'red'))
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.

    This module prepares data for later gene duplication analysis. It allows
    the user to provide either a single sample, multiple samples, NCBI GenBank
    IDs or NCBI taxonomy IDs to retrieve and obtain the annotation data.

    :param arg_dict: Parsed command-line arguments for the input module.
    :returns: Empty tuple on completion; parsed data is written under the
        output folder and summarized in report/input/info.csv.
    """
    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        ## (removed unused local `data_dir = outdir` — it was never read)
    else:
        arg_dict.project = True
        print("+ Generate a directory containing information within the project folder provided")
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t |-- GenBank format files')
    print('\t |-- GFF files + Reference fasta files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')
    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ## 'species','taxonomy','genome',
    ## 'annot_file','format_annot_file', 'proteins',
    ## 'plasmids_number','plasmids_ID'))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'), index=True, header=True)

    ## maybe add a summary of the files?
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting Input module.")
    return ()
def get_options_db(options):
    """Select databases to use according to the input options.

    :param options: Parsed command-line options; the fields read here are
        database, only_kma_db, kma_dbs, only_user_data, only_genbank_data,
        only_external_kma, kma_external_files, user_data, genbank_data,
        all_data and MLST_profile.
    :returns: Dataframe with database information among all databases available.
    """
    ## NOTE: relies on the module-level global `Debug` set by the calling module
    print("\n\n+ Select databases to use for identification:")

    ### database folder to use
    database2use = os.path.abspath(options.database)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: Database to use: " + database2use + " **", 'yellow'))

    ## according to user input: select databases to use
    option_db = ""

    ############################################################
    ## Default db KMA
    ############################################################
    kma_dbs = []
    if not options.only_kma_db:  ## exclusive
        #kma_dbs = ["bacteria", "plasmids"]
        kma_dbs = ["bacteria"]

    if (options.kma_dbs):
        ## merge user-provided dbs with the defaults, removing duplicates
        options.kma_dbs = options.kma_dbs + kma_dbs
        options.kma_dbs = set(options.kma_dbs)
    else:
        options.kma_dbs = kma_dbs

    ## raise error & exit if no dbs provided
    if not (options.kma_dbs):
        print(colored("***ERROR: No database provided via --kma_db option.\n", 'red'))
        exit()

    ############################################################
    ### Options:

    ############
    ## 1) only user data: previously identified and added
    ############
    if (options.only_user_data):
        option_db = "user_data"

    ############
    ## 2) only genbank data: previously download from NCBI reference genomes
    ############
    elif (options.only_genbank_data):
        option_db = "genbank"

    ############
    ## 3) only external kma
    ############
    elif (options.only_external_kma):
        option_db = get_external_kma(options.kma_external_files, Debug)
        ## raise attention
        if (options.kma_dbs):
            ## BUGFIX: typo "Defatult" corrected in the user-facing message
            print(colored("***ATTENTION:\nDefault databases and databases provided via --kma_dbs option would not be used as --only_external_kma option provided.\n", 'red'))

    #################
    ## all databases
    #################
    else:
        ####################
        ## default KMA dbs
        ####################
        print('\t- Selecting kma databases:')
        kma_dbs_string = ','.join(options.kma_dbs)
        option_db = "kma:" + kma_dbs_string
        for i in options.kma_dbs:
            print(colored('\t\t+ %s' % i, 'green'))

        #################
        ## External file
        #################
        if (options.kma_external_files):
            option_db_tmp = get_external_kma(options.kma_external_files, Debug)
            option_db = option_db + '#' + option_db_tmp

        #############################
        ## Previously identified data
        #############################
        if any([options.user_data, options.all_data]):
            option_db = option_db + '#kma_user_data:user_data'

        #############################
        ## Genbank reference data
        #############################
        if any([options.genbank_data, options.all_data]):
            option_db = option_db + '#kma_NCBI:genbank'

    ###############
    ### PubMLST ###
    ###############
    print("\n\t - Select MLST profiles")
    option_db_PubMLST = 'MLST:PubMLST'
    print(colored("\t\t + Default MLST profile under database provided: PubMLST", 'green'))

    if options.MLST_profile:
        ## user provides a PubMLST profile
        options.MLST_profile = os.path.abspath(options.MLST_profile)
        option_db_PubMLST = option_db_PubMLST + '#MLST:' + options.MLST_profile
        print(colored("\t\t + User provided MLST profile: %s" % options.MLST_profile, 'green'))

    ###############
    ### get dbs
    ###############
    print("\n+ Parsing information to retrieve databases")
    print("+ Reading from database: " + database2use)
    HCGB_aes.print_sepLine("-", 50, False)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: option_db: " + option_db + " **", 'yellow'))
        print(colored("**DEBUG: option_db_PubMLST : " + option_db_PubMLST + " **", 'yellow'))

    pd_KMA = database_generator.getdbs("KMA", database2use, option_db, Debug)
    pd_PubMLST = database_generator.getdbs("MLST", database2use, option_db_PubMLST, Debug)
    HCGB_aes.print_sepLine("-", 50, False)

    ## return both dataframes
    pd_Merge = pd.concat([pd_KMA, pd_PubMLST], sort=True, ignore_index=True)
    return (pd_Merge)
def run_database(options):
    """Main entry point of the ``database`` module.

    Creates/updates the local database folder: NCBI downloads, user project
    data, ARIBA databases, KMA indexes and (optionally) BUSCO datasets.

    :param options: argparse-like namespace with the module's command-line
        options (``path``, ``ID_file``, ``descendant``, ``project_folder``,
        ARIBA/KMA/BUSCO selections, ``debug``, ``threads``...).

    :returns: Empty tuple on completion.
    """

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()

    ######################################################
    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check descendant NCBI taxonomy ids provided ---------\n")
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(colored("DEBUG: User provides folder containing project", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(colored("ERROR: Folder provided does not exists: %s" % options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        #functions.print_sepLine("*",50, False)

        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print("+ Generate ARIBA database for databases provided: prepare fasta and metadata information")

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                ## FIX: these were bare names (NameError); they are attributes
                ## of the options namespace
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print("+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website")

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
def get_userData_info(options, project_folder):
    """Collect previously generated analysis information for user samples.

    Gathers, for each sample under ``project_folder``:
      - virulence/resistance profile files ("profile", csv)
      - species identification files ("ident", csv)
      - clustering signatures ("mash", sig)
    and merges them into a single dataframe keyed by sample name.
    """

    def _fetch(banner, folder_tag, extensions):
        ## one retrieval round: banner, lookup, index by sample name
        print()
        HCGB_aes.print_sepLine("-", 60, 'yellow')
        print(banner)
        found = sampleParser.files.get_files(options, project_folder,
                                             folder_tag, extensions,
                                             options.debug)
        if not found.empty:
            found = found.set_index('name')
        HCGB_aes.print_sepLine("-", 60, 'yellow')
        return found

    ## get profile information
    pd_samples_profile = _fetch("+ Retrieve virulence/resistance profile information:",
                                "profile", ["csv"])

    ## get identification information
    pd_samples_ident = _fetch("+ Retrieve species identification information:",
                              "ident", ["csv"])

    ## get mash information
    pd_samples_mash = _fetch("+ Retrieve cluster information:",
                             "mash", ["sig"])
    print()

    ## add other if necessary

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_samples_profile **", 'yellow'))
        print(pd_samples_profile)
        print(colored("**DEBUG: pd_samples_ident **", 'yellow'))
        print(pd_samples_ident)
        print(colored("**DEBUG: pd_samples_mash **", 'yellow'))
        print(pd_samples_mash)

    ## merge: joining by inner we only get common columns among all
    df = pd.concat([pd_samples_profile, pd_samples_ident, pd_samples_mash],
                   join='inner', sort=True).drop_duplicates()

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat **", 'yellow'))
        print(df)

    ## set new column with name of samples
    df = df.reset_index()
    ## rename column
    df.rename(columns={'index': 'name'}, inplace=True)

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat reset_index**", 'yellow'))
        print(df)

    ##
    return (df)
def print_all(): HCGB_aes.print_sepLine("+", 50, False) print('Python:') HCGB_aes.print_sepLine("+", 50, False) print('Python version:', str(sys.version)) print('\n') HCGB_aes.print_sepLine("+", 50, False) print('Python packages:') extern_progs.print_package_version() HCGB_aes.print_sepLine("+", 50, False) print('\n') HCGB_aes.print_sepLine("+", 50, False) print('External dependencies:\n') HCGB_aes.print_sepLine("+", 50, False) extern_progs.print_dependencies() print('\n') print('Additional dependencies: databases, information, etc...') HCGB_aes.print_sepLine("*", 50, False) print("ARIBA databases version..") HCGB_aes.print_sepLine("*", 50, False) print("card -> ") print("megares ->") print("plasmidfinder ->") print("resfinder ->") print("srst2_argannot ->") print("vfdb_core & vfdb_full ->") print("virulencefinder ->") print('\n')
def help_fastq_format():
    """
    Explanation of fastq format details.

    Prints to stdout the naming rules for fastq input files: length limits,
    single/paired-end conventions, lane tags, merge options and accepted
    extensions. Purely informational; no return value.
    """
    ## NOTE(review): typos in the printed text below ("necessay", "nane",
    ## "appropiate", "full supported", "name.fastq.g") are user-facing
    ## runtime strings and are intentionally left untouched here.

    HCGB_aes.boxymcboxface("Name format for samples")

    print("Format for fastq files can be:")
    print("name.fastq.gz, name_1.fastq.gz, name_R2.fastq.gz, name_L001_R1.fastq.gz, name_L001_R1_001.fastq.gz etc.")
    print("\nThere are many options and here we provide some guidelines on the name format.")
    print("\n")

    ## [1] sample-ID length limit
    HCGB_aes.print_sepLine("*", 20, "red")
    print("[1] Length limitation")
    HCGB_aes.print_sepLine("*", 20, "red")
    print("There is a limitation for the sample ID ('name') of 25 characters.")
    print(colored("** BacterialTyper provides an option to rename samples if necessary: module prep option --rename **", 'yellow'))
    print("\n")

    ## [2] single-end naming
    HCGB_aes.print_sepLine("*", 20, "red")
    print("[2] Single end files")
    HCGB_aes.print_sepLine("*", 20, "red")
    print(colored('** Use option --single-end in the different BacterialTyper modules **', 'yellow'))
    print("name.fastq.gz")
    print("name.fastq")
    print("name.fq")
    print("\n")

    ## [3] paired-end naming
    HCGB_aes.print_sepLine("*", 20, "red")
    print("[3] Paired-end files")
    HCGB_aes.print_sepLine("*", 20, "red")
    print("Paired-end files are full supported. The format for these files are:")
    print("Read1 => name_1.fastq.g or name_R1.fastq.gz")
    print("Read2 => name_2.fastq.gz or name_R2.fastq.gz")
    print(colored('** See additional details for Lane information **', 'yellow'))
    print("\n")

    ## [4] lane tags and merge examples
    HCGB_aes.print_sepLine("*", 55, "red")
    print("[4] Lane information:")
    HCGB_aes.print_sepLine("*", 55, "red")
    print("In some cases, files might contain lane information (*L00x* and/or *00x*).")
    print("BacterialTyper supports these names as long as follow these examples:")
    print("name_L00x_R1.fastq.gz\tname_L00x_R2.fastq.gz")
    print("name_L00x_1.fastq.gz\tname_L00x_2.fastq.gz")
    print("name_L00x_R1.fastq.gz\tname_L00x_R2.fastq.gz")
    print("name_L00x_R1_00x.fastq.gz\tname_L00x_R2_00x.fastq.gz")
    print("\n")
    print("Sometimes it might be appropriate to include lane tags (*L00X*) within the name.")
    print(colored("** Use option --include-lane within each module", 'yellow'))
    print(colored("\n** If you need to merge fastq files from different lanes, use option within module prep **", 'yellow'))
    print("As an example:")
    ## --merge: collapse all lanes into one pair per sample
    print(colored("\n** Option --merge within module prep **", 'yellow'))
    print("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
    print("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
    print("sample1_L003_R1.fastq.gz\tsample1_L003_R2.fastq.gz")
    print("sample1_L004_R1.fastq.gz\tsample1_L004_R2.fastq.gz")
    print("Result:")
    print("--------------------------------------------------")
    print("sample1_R1.fastq.gz\tsample1_R2.fastq.gz")
    print("\n")
    ## --merge-by-lane: collapse chunk files but keep one pair per lane
    print(colored("\n** Option --merge-by-lane within module prep **", 'yellow'))
    print("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
    print("sample1_L001_R1_002.fastq.gz\tsample1_L001_R2_002.fastq.gz")
    print("sample1_L002_R1_001.fastq.gz\tsample1_L002_R2_001.fastq.gz")
    print("sample1_L002_R1_002.fastq.gz\tsample1_L002_R2_002.fastq.gz")
    print("--------------------------------------------------")
    print("Result:")
    print("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
    print("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
    print(colored("** Remember to use option --include_lane within each module", 'yellow'))
    print("\n")

    ## [5] arbitrary extra tags in names
    HCGB_aes.print_sepLine("*", 55, "red")
    print("[5] Include all information:")
    HCGB_aes.print_sepLine("*", 55, "red")
    print("In some cases, files might contain other information and it is necessay to " +
          "include it all as a tag nane. See as an example:")
    print("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
    print(colored("** Remember to use option --include_all within each module", 'yellow'))
    print(colored("** It might be appropiate to change samples names using --rename option under prep module", 'yellow'))
    print("\n")

    ## [6] accepted file extensions
    HCGB_aes.print_sepLine("*", 15, "red")
    print("[6] Extensions:")
    HCGB_aes.print_sepLine("*", 15, "red")
    print("name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz")
    print("\n")
def print_all():
    """Print citation information: third-party software table, ARIBA
    database citations, and KMA/BUSCO references.

    Purely informational; prints to stdout and returns nothing.
    """

    ## software citation table
    print("")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("\tSOFTWARE")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("Third party softwares included or employed during the pipeline workflow.")
    print("")
    df_software_citation = pd.DataFrame.from_dict(
        software_citation(), orient='index',
        columns=('Article Title', 'Authors', 'PUBMED ID', 'Website'))
    df_software_citation.index.names = ['Software']
    ## NOTE: these set pandas display options globally (process-wide),
    ## not just for this print
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_columns', None)
    print(df_software_citation)
    print("")

    ## database citations header
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("\tDATABASES")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("")
    print("Please cite according to your selection.")
    print("")

    ## ARIBA database citation table
    HCGB_aes.print_sepLine("+", 50, False)
    print("\tARIBA databases")
    HCGB_aes.print_sepLine("*", 50, False)
    df_ARIBA_DB_citation = pd.DataFrame.from_dict(
        ariba_citation(), orient='index',
        columns=('Article Title', 'Authors', 'PUBMED ID', 'Website'))
    df_ARIBA_DB_citation.index.names = ['Databases']
    print(df_ARIBA_DB_citation)
    print("\n")

    ## KMA section (citation text not filled in yet)
    HCGB_aes.print_sepLine("*", 50, False)
    print("\tKMA software & databases")
    HCGB_aes.print_sepLine("*", 50, False)
    print()
    print()

    ## BUSCO citations (hard-coded reference text)
    HCGB_aes.print_sepLine("*", 50, False)
    print("\tBUSCO software & dataset")
    HCGB_aes.print_sepLine("*", 50, False)
    print("BUSCO applications from quality assessments to gene prediction and phylogenomics.")
    print("Robert M. Waterhouse, Mathieu Seppey, Felipe A. Simão, Mose Manni, Panagiotis ")
    print("Ioannidis, Guennadi Klioutchnikov, Evgenia V. Kriventseva, and Evgeny M. Zdobnov")
    print("Mol Biol Evol, published online Dec 6, 2017, doi: 10.1093/molbev/msx319")
    print()
    print("BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs.")
    print("Felipe A. Simão, Robert M. Waterhouse, Panagiotis Ioannidis, Evgenia ")
    print("V. Kriventseva, and Evgeny M. Zdobnov")
    print("Bioinformatics, published online June 9, 2015, doi: 10.1093/bioinformatics/btv351")
    print()
    print("For further details, please visit: https://busco.ezlab.org/ or https://www.orthodb.org/")
    print()
    print()
def run(options):
    """ This is the main function of the module ``config``. It basically checks
    if the different requirements (python` and third-party software) are fulfilled.

    If any requirement is not available this modules tries to install them or
    reports to the user to manually install them.

    :param option: State whether to check or install missing modules, packages
        and third party software. Provide: check/install
    :param install_path: Absolute path to install modules or packages missing.
        Default: ``BacterialTyper`` environment path.
    :param IslandPath: True/False for checking additional perl and software
        required by this option analysis.
    :param debug: True/false for debugging messages.

    :type option: string
    :type IslandPath: boolean
    :type install_path: string
    :type debug: boolean

    .. seealso:: This function depends on several ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.check_python_packages`
        - :func:`BacterialTyper.config.set_config.check_perl_packages`
        - :func:`BacterialTyper.config.extern_progs.return_min_version_soft`
        - :func:`BacterialTyper.config.extern_progs.print_dependencies`
    """

    ## init time
    start_time_total = time.time()

    ## debugging messages: sets the module-level Debug flag used below
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Pipeline Configuration")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## resolve installation path: user-provided folder, or derived from the
    ## running environment
    if (options.install_path):
        if os.path.isdir(options.install_path):
            if (Debug):
                print("Installation path provided for missing modules, packages, dependencies...")
                print("Path: " + options.install_path)
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()
    else:
        ## get python environment path
        ## NOTE(review): os.environ['_'] is the shell's last-command variable;
        ## it may be unset or not point at the interpreter in some shells —
        ## TODO confirm this is reliable for all launch methods
        env_bin_directory = os.path.dirname(os.environ['_'])
        ##os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'templates'))
        options.install_path = os.path.abspath(
            os.path.join(env_bin_directory, '../software'))

        if (Debug):
            print("Retrieve environment path as installation path:")
            print("Path: " + options.install_path)

        HCGB_files.create_folder(options.install_path)

    #######################
    ## install or only check
    #######################
    option_install = False
    if (options.option == 'install'):
        print("\n+ Check dependencies")
        print("+ Try to install all missing dependencies, modules or third party software...")
        option_install = True

        ## check if access and permission
        if os.path.isdir(options.install_path):
            ## NOTE(review): mode=os.F_OK only checks existence, not write
            ## permission — TODO confirm access_check semantics
            if (set_config.access_check(options.install_path, mode=os.F_OK)):
                print("Installation path is accessible and has permission for installation if necessary")
            else:
                print(colored("\n*** ERROR ****", 'red'))
                print(colored("No access/permission for this path: %s" % options.install_path, 'red'))
                print(colored("Please provide a valid path with access/permission to install any missing dependencies.", 'red'))
                exit()
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()
    elif (options.option == 'only_check'):
        print("\nCheck dependencies, modules or third party software and print report...")

    #######################
    ## python version
    #######################
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python:')
    HCGB_aes.print_sepLine("+", 20, False)

    this_python_version = str(sys.version)
    python_min_version = extern_progs.return_min_version_soft('python')
    if LooseVersion(this_python_version) >= LooseVersion(python_min_version):
        print(colored("Minimum version (%s) satisfied: %s" % (python_min_version, this_python_version), 'green'))
    else:
        print(colored("Minimum version (%s) not satisfied: %s" % (python_min_version, this_python_version), 'red'))
        exit()

    #######################
    ## perl_version
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Perl:')
    HCGB_aes.print_sepLine("+", 50, False)

    perl_min_version = extern_progs.return_min_version_soft('perl')
    this_perl_path = set_config.get_exe("perl", Debug)
    this_perl_version = set_config.get_version("perl", this_perl_path, Debug)
    if LooseVersion(this_perl_version) >= LooseVersion(perl_min_version):
        print(colored("Minimum version (%s) satisfied: %s" % (perl_min_version, this_perl_version), 'green'))
    else:
        print(colored("Minimum version (%s) not satisfied: %s" % (perl_min_version, this_perl_version), 'red'))
        exit()

    #######################
    ## third-party software
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('External dependencies:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_dependencies(option_install, options.install_path, Debug)
    print('\n')

    #######################
    ## python packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_python_packages(Debug, option_install, options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## perl packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Perl packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_perl_packages("perl_dependencies", Debug, option_install,
                                   options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## IslandPath dependencies (optional)
    #######################
    if (options.IslandPath):
        print('\n')
        HCGB_aes.print_sepLine("+", 20, False)
        print('IslandPath packages and software required:')
        HCGB_aes.print_sepLine("+", 20, False)

        set_config.check_IslandPath(Debug, option_install, options.install_path)
        HCGB_aes.print_sepLine("+", 20, False)
        print('\n')

    #######################
    ## R packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('R packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_R_packages(option_install, options.install_path, Debug)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')
def update_database_user_data(database_folder, project_folder, Debug, options):
    """
    Updates user_data folder within the database folder provided.

    It would generate single subfolder for each sample previously analyzed and
    it would store main information and result files for later interpretation,
    comparison and/or summarization with new samples analyzed.

    :param database_folder: Absolute path to the database folder; a
        ``user_data`` subfolder is created/reused inside it.
    :param project_folder: Absolute path to the user project folder to scan.
    :param Debug: True/False for debugging messages.
    :param options: argparse-like namespace; ``project``, ``debug`` and
        ``pair`` are (re)set here, ``threads`` and ``single_end`` are read.

    :type database_folder: string
    :type project_folder: string
    :type Debug: bool
    :type options: namespace

    :returns: Updated database result from
        :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
    :rtype: Dataframe

    :warnings: Returns **FAIL** if check process failed.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_subfolder`
        - :func:`HCGB.functions.main_functions.functions.get_data`
        - :func:`HCGB.functions.main_functions.optimize_threads`
        - :func:`BacterialTyper.scripts.database_user.get_userData_files`
        - :func:`BacterialTyper.scripts.database_user.update_sample`
        - :func:`BacterialTyper.scripts.database_generator.getdbs`
        - :func:`BacterialTyper.scripts.database_generator.get_database`
        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
    """

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options: mutates the shared options namespace so the
    ## sampleParser helpers behave as in project mode
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data: outer join keeps every column from both frames
    project_all_data = pd.concat([project_data_df, project_info_df],
                                 join='outer', sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages:
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)
        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)
        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder,
                                         'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe: one group per sample name
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    ## threads_job = threads per job; max_workers_int = parallel jobs
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' % (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## send for each sample group; map future -> sample name for reporting
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data,
                            user_data_db, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                ## failures are reported but do not abort the other samples
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    for name, cluster in sample_frame:
        ###### dump to file: each update_sample call is expected to have
        ## written <own_data>/<name>/info.txt
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            user_data_db = pd.concat([user_data_db, dataGot],
                                     join='outer', sort=True).drop_duplicates()
            ## concatenating by outer we get all available entries

    if (options.debug):
        print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
        print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'

    dataUpdated = database_generator.update_db_data_file(user_data_db,
                                                         database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)
def NCBI_DB(strains2get, data_folder, Debug):
    """Donwloads given taxa from NCBI if not available and updates database
    information.

    This function checks in the given folder if strain of interest is
    available. If not it would connect to NCBI using python module
    ncbi_genome_download and downloads some information.

    :param strains2get: dataframe containing genus, species and NCBI assembly
        columns among others. See example below.
    :param data_folder: Absolute path to database NCBI folder.
    :param Debug: Print messages for debugging purposes if desired.

    :type strains2get: dataframe
    :type data_folder: string
    :type Debug: bool

    :return: Dataframe of genbank database updated for all available entries.

    Columns for the dataframe :file:`strains2get` consist of:

    sample,genus,species,strain,BioSample,genome,Plasmids

    See and example in file: :file:`/devel/results/strains2get_NCBI_DB.csv`

    See example of the return dataframe, containing database information
    updated in file: :file:`/devel/results/genbank_database.csv`

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.file_funtcions.create_folder`
        - :func:`HCGB.functions.main_functions.get_data`
        - :func:`BacterialTyper.scripts.database_generator.get_dbs`
        - :func:`BacterialTyper.scripts.database_generator.get_database`
        - :func:`BacterialTyper.scripts.database_generator.NCBIdownload`
        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`

    .. include:: ../../links.inc
    """

    ## set index: keep the assembly accession both as index ('ID') and column
    strains2get = strains2get.set_index('NCBI_assembly_ID', drop=False)  ## set new index but keep column
    strains2get.index.names = ['ID']  ## rename index
    strains2get = strains2get.drop_duplicates()

    #########
    if Debug:
        print(colored("DEBUG: NCBI data provided: ", 'yellow'))
        print(strains2get)

    ## get data existing database
    print("+ Create the database in folder: \n", data_folder)
    HCGB_files.create_folder(data_folder)

    ## read database
    db_frame = getdbs('NCBI', data_folder, 'genbank', Debug)
    database_df = get_database(db_frame, Debug)

    #########
    if Debug:
        print(colored("DEBUG: NCBI genbank database retrieved: ", 'yellow'))
        print("db_frame")
        print(db_frame)
        print()
        print("database_df")
        print(database_df)

    ## loop and download
    for index, row in strains2get.iterrows():
        HCGB_aes.print_sepLine("+", 75, False)
        acc_ID = index  #strains2get.loc[index]['NCBI_assembly_ID']
        info = "Genus: " + strains2get.loc[index]['genus'] + '\n' + \
               "Species: " + strains2get.loc[index]['species'] + '\n' + \
               "Strain: " + strains2get.loc[index]['name'] + '\n' + \
               "ID accession: " + acc_ID + '\n'
        ## module ngd requires to download data in bacteria subfolder under genbank folder
        dir_path = data_folder + '/genbank/bacteria/' + acc_ID

        ## check if already exists
        if acc_ID in database_df.index:
            print("\n+ Data is already available in database for: ")
            print(colored(info, 'green'))

        else:
            ## download
            print("\n+ Downloading data for:")
            print(colored(info, 'green'))
            data_accID = NCBIdownload(acc_ID, strains2get, data_folder)
            this_db = HCGB_main.get_data(data_accID, ',', 'index_col=0')
            this_db = this_db.set_index('ID')
            ## FIX: DataFrame.append was deprecated (pandas 1.4) and removed
            ## (pandas 2.0); pd.concat is the supported equivalent
            database_df = pd.concat([database_df, this_db])

    ## Generate/Update database
    database_csv = data_folder + '/genbank_database.csv'
    db_updated = update_db_data_file(database_df, database_csv)
    print("+ Database has been generated in file: ", database_csv)
    return (db_updated)
def parse_options(arg_dict):
    """Parse input options and build the df_accID dataframe of annotation entries.

    Depending on which mutually exclusive input option is set on *arg_dict*
    (annotation file(s), NCBI GenBank assembly IDs, NCBI taxonomy IDs, or a
    previous BacDup project folder), collects or downloads annotation data
    and returns one dataframe row per sample.

    :param arg_dict: parsed command-line arguments (argparse-style namespace).
        NOTE: several of its attributes (annot_file, ref_file, GenBank_id)
        are mutated in place to absolute paths.
    :returns: dataframe indexed by 'new_name' with the columns given by
        BacDup_functions.columns_accID_table().
    """
    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:', 'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file, 'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)
            print(colored('\t* Multiple annotation files provided .......[OK]', 'green'))

            ## batch file maps sample name -> annotation file (comma separated)
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('dict_entries check annotation files provided option:', 'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            # NOTE(review): local name 'format' shadows the builtin of the same name
            format = format_checker.is_format(file_annot, arg_dict.debug)
            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy, organism) = BacDup.scripts.functions.get_gbk_information(file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available
            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)
                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            # taxonomy is presumably a list when parsed from gbk; otherwise the
            # empty-string init keeps this branch skipped — TODO confirm helper
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)
            print(colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]', 'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))  ## drop empty lines

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]', 'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:', 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)
        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel', arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        # NOTE(review): the comprehension variable shadows the builtin 'int'
        string_info_total = [str(int) for int in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message('arg_dict.assembly_level: ' + arg_dict.assembly_level, "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get, allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
            db_folder, string_info_total, int(arg_dict.k_random), arg_dict.debug,
            assembly_level_given=arg_dict.assembly_level,
            group_given=group_obtained, section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)

        ## save into file
        # NOTE(review): file_info is assigned but not used in this function
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print("ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" + os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" + os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nIf random numbers selected, take into account re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(colored('\t* A previous BacDup analysis project folder:.......[OK]', 'green'))
        ## create df_accID to store data
        ## TODO

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases, start_time_partial):
    """Run ARIBA identification for every sample against every indexed database.

    Checks which ARIBA databases are indexed, sends one ARIBA job per
    (sample, database) pair through a thread pool, collects per-database
    results and writes a combined Excel summary (profile_summary.xlsx).

    :param options: options namespace (threads, ARIBA_cutoff, database,
        output_folder, ...).
    :param pd_samples_retrieved: dataframe of samples with 'name' and 'sample'
        (file path) columns.
    :param outdir_dict: mapping sample name -> output directory.
    :param retrieve_databases: dataframe with 'source', 'db', 'path' columns.
    :param start_time_partial: timestamp used for elapsed-time reporting.

    NOTE(review): this function reads 'Debug', 'Project' and 'input_dir'
    which are not parameters — presumably module-level globals set by the
    module entry point; confirm they are defined before calling.
    """
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status ##
    ##################
    databases2use = []  ## entries are [path, db name]
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(options.database)

    ## check status of other databases if any
    # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(
                colored("**DEBUG: card_trick_info: " + card_trick_info + " **",
                        'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db', 'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## one output folder per (sample, database) pair
    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name], db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################

    ## ariba assembly cutoff: default to 0.90 when unset
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads: split total threads across concurrent sample jobs
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop: one batch of jobs per database, all samples in parallel
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0], db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job, options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            ## wait for this database's jobs; report (but do not abort on) failures
            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print(
                "+ Collecting information for each sample analyzed for database: "
                + db2use[1])

            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print(
        "\n + Generate a summary file for all samples and one for each database employed..."
    )

    ## parse results
    if Project:
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir) ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        ## collect each sample's report.tsv for this database
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True  ## triggers the extra VFDB report section below
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Different databases provided (different to VFDB and CARD) would collapse file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:
            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                # NOTE(review): rename(inplace=True) returns None; df2 is
                # immediately reassigned on the next line, so this is harmless
                df2 = df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()

        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    # NOTE(review): ExcelWriter.save() was removed in pandas 2.0; use
    # writer.close() (or a context manager) when upgrading pandas.
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print(
        "+ For each database upload files *phandango.csv and *phandango.tre and visualize results"
    )
def get_userData_files(options, project_folder):
    """Collect trimmed-read, assembly and annotation files for project samples.

    Scans *project_folder* three times via sampleParser (trimmed reads,
    assemblies, annotations), indexes each result by sample name and merges
    them on their common columns.

    :param options: options namespace; options.debug is read here.
    :param project_folder: path to the project folder to scan.
    :returns: merged dataframe (inner join, duplicates dropped) with the
        sample name restored as a regular column.
    """

    def _collect(banner, mode, extensions):
        # Print a separator-framed banner, fetch the matching files and
        # index the resulting dataframe by sample name.
        print()
        HCGB_aes.print_sepLine("-", 60, 'yellow')
        print(banner)
        frame = sampleParser.files.get_files(options, project_folder, mode,
                                             extensions, options.debug)
        HCGB_aes.print_sepLine("-", 60, 'yellow')
        return frame.set_index('name')

    ## gather each data type in turn
    pd_samples_reads = _collect("+ Retrieve trimmed reads information:",
                                "trim", ['_trim'])
    pd_samples_assembly = _collect("+ Retrieve assembly information:",
                                   "assembly", ["fna"])
    pd_samples_annot = _collect("+ Retrieve annotation information:",
                                "annot", ['gbf', 'faa', 'gff'])

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_samples_reads **", 'yellow'))
        print(pd_samples_reads)
        print(colored("**DEBUG: pd_samples_assembly **", 'yellow'))
        print(pd_samples_assembly)
        print(colored("**DEBUG: pd_samples_annot **", 'yellow'))
        print(pd_samples_annot)

    ## merge: joining by inner we only get common columns among all
    df = pd.concat([pd_samples_reads, pd_samples_annot, pd_samples_assembly],
                   sort=True, join='inner').drop_duplicates()

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat **", 'yellow'))
        print(df)

    ## set new column with name of samples
    df = df.reset_index()

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat reset_index**", 'yellow'))
        print(df)

    ##
    return (df)