def get_database(db_frame, Debug):
    data4db = pd.DataFrame()
    for index, row in db_frame.iterrows():
        ## information
        this_file = db_frame.loc[index]['path'] + '/info.txt'
        if os.path.isfile(this_file):
            print('+ Reading information for sample: ', db_frame.loc[index]['db'])
            print(colored("\t+ Obtaining information from file: %s" % this_file, 'yellow'))
            this_db = HCGB_main.get_data(this_file, ',', 'index_col=0')
            data4db = data4db.append(this_db)

        timestamp = db_frame.loc[index]['path'] + '/.success'
        if os.path.isfile(timestamp):
            stamp = HCGB_time.read_time_stamp(timestamp)
            print(colored("\t+ Data generated on: %s" % stamp, 'yellow'))

        HCGB_aes.print_sepLine("*", 25, False)

    ## index by ID
    if not data4db.empty:
        data4db = data4db.set_index('ID')

    return (data4db)
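## Illustrative usage sketch (not part of the original module): how get_database()
## is typically fed. The folder layout below is hypothetical; the function only
## needs a dataframe with 'path' and 'db' columns, where each path may contain an
## 'info.txt' CSV and an optional '.success' timestamp file.
def _example_get_database():
    import pandas as pd
    db_frame = pd.DataFrame({
        'db': ['sample1', 'sample2'],
        'path': ['/data/NCBI/genbank/bacteria/acc_1',
                 '/data/NCBI/genbank/bacteria/acc_2']})
    ## returns an empty dataframe if no info.txt files are found
    return get_database(db_frame, Debug=False)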
def get_perl_packages(Debug, file_name):
    """
    Retrieves the version of the perl packages installed in the system.

    It retrieves the dependencies name conversion from file
    :file:`BacterialTyper/config/perl/perl_dependencies.csv` using function
    :func:`BacterialTyper.config.extern_progs.file_list` and
    :func:`BacterialTyper.scripts.functions.get_data`.

    For each module it retrieves the package version installed in the system
    using :func:`BacterialTyper.config.set_config.check_perl_package_version`.

    :returns: Dictionary containing for each perl module (key) the installed version (value).

    .. seealso:: This function relies on other ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.check_perl_package_version`

        - :func:`BacterialTyper.config.extern_progs.file_list`

        - :func:`BacterialTyper.scripts.functions.get_data`
    """
    ## get info for perl modules
    perl_lib_dependecies_file = extern_progs.file_list(file_name)
    perl_lib_dependecies = HCGB_main.get_data(perl_lib_dependecies_file, ',', 'index_col=0')

    my_packages_installed = {}
    for index_name, row in perl_lib_dependecies.iterrows():
        module_name = row['module']
        installed = check_perl_package_version(module_name, Debug)  ## check version installed in system
        if not (installed):
            installed = 'n.a.'
        my_packages_installed[index_name] = installed

    return (my_packages_installed)
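## Illustrative usage sketch (assumption: 'perl_dependencies' is the file-list key
## registered in extern_progs; adjust if the configuration uses another name).
def _example_get_perl_packages():
    installed = get_perl_packages(Debug=False, file_name='perl_dependencies')
    for module, version in installed.items():
        print(module, version)  ## version is 'n.a.' when the module is missing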
def get_info_software():
    """Read software information.

    Reads the information stored in file
    :file:`BacterialTyper/config/software/software_details.csv` and returns a
    pandas dataframe.
    """
    info_file = os.path.abspath(os.path.join(os.path.dirname(__file__), 'software', 'software_details.csv'))
    return (HCGB_main.get_data(info_file, ',', 'index_col=0'))
def update_db_data_file(data, csv):
    if os.path.isfile(csv):
        print("\n+ Updating database")
        print("+ Obtaining information from database file: %s" % csv)
        db2update = HCGB_main.get_data(csv, ',', 'index_col=0')

        ## TODO: provide preference to db2update
        df = pd.concat([db2update, data], join='inner', sort=True).drop_duplicates()
        df.to_csv(csv)
        return (df)
    else:
        data.to_csv(csv)
        return (data)
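## Illustrative sketch of the update semantics: new entries are concatenated with
## the existing CSV (if any) and de-duplicated, so re-running is idempotent.
## The path and accession below are hypothetical.
def _example_update_db_data_file():
    import pandas as pd
    new_entries = pd.DataFrame({'genus': ['Staphylococcus'],
                                'species': ['aureus'],
                                'genome': ['/data/GCF_000013425.1_genomic.fna']},
                               index=['GCF_000013425.1'])
    return update_db_data_file(new_entries, '/data/genbank_database.csv')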
def retrieve_genes_ids_profile(profile, gene_ID, debug, option):
    """
    Retrieves, from a gene profile CSV, the rows matching a given gene.

    With option='name' genes are matched by name prefix (regex); with
    option='ID' an exact identifier match is required.
    """
    ## read data
    get_csv_data = HCGB_main.get_data(profile, ',', '')

    if option == 'name':
        list_Genes = get_csv_data['Genes'].to_list()
        get_csv_data.index = get_csv_data['Genes']
    elif option == 'ID':
        list_Genes = get_csv_data['ID'].to_list()
        get_csv_data.index = get_csv_data['ID']

    ## debug messages
    if debug:
        HCGB_aes.debug_message('profile: ' + profile, 'yellow')
        HCGB_aes.debug_message('gene_id: ' + str(gene_ID), 'yellow')
        HCGB_aes.debug_message('data: ', 'yellow')
        print(get_csv_data)
        HCGB_aes.debug_message('Option: ' + option, 'yellow')
        HCGB_aes.debug_message('Genes: ', 'yellow')
        print(list_Genes)

    ## search accordingly
    if option == 'name':
        regex_search = re.compile("^" + gene_ID + ".*")
        filtered_genes = list(filter(regex_search.match, list_Genes))

        ## debug messages
        if debug:
            HCGB_aes.debug_message('filtered_genes: ', 'yellow')
            print(filtered_genes)
            HCGB_aes.debug_message('get_csv_data.loc[filtered_genes]: ', 'yellow')
            print(get_csv_data.loc[filtered_genes])

        return (get_csv_data.loc[filtered_genes])
    else:
        if gene_ID in list_Genes:
            ## debug messages
            if debug:
                HCGB_aes.debug_message('gene_id: ' + gene_ID, 'yellow')
                print(get_csv_data.loc[gene_ID].to_frame().transpose())

            return (get_csv_data.loc[gene_ID].to_frame().transpose())
        else:
            return (pd.DataFrame())
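## Illustrative usage sketch: option='name' matches gene names by prefix (regex),
## option='ID' requires an exact identifier. The profile CSV path is hypothetical;
## the file must contain 'Genes' and/or 'ID' columns.
def _example_retrieve_genes_ids_profile():
    ## all rows whose gene name starts with 'mecA'
    by_name = retrieve_genes_ids_profile('/data/profile.csv', 'mecA', debug=False, option='name')
    ## the single row with this exact ID (empty dataframe if absent)
    by_id = retrieve_genes_ids_profile('/data/profile.csv', 'ARO:3000617', debug=False, option='ID')
    return (by_name, by_id)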
def busco_datasets():
    """BUSCO dataset information

    :return: List containing information for each dataset available in file
             BUSCO_dataset.csv under the data directory.

    .. seealso:: Additional information on BUSCO datasets available.

        - :doc:`BUSCO datasets <../../../data/BUSCO_datasets>`

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.data.data_files.data_list`

        - :func:`BacterialTyper.scripts.functions.file2dataframe`
    """
    ## read from file: BUSCO_dataset.csv
    BUSCO_dataset_file = data.data_files.data_list("BUSCO_dataset")
    busco_data = HCGB_main.get_data(BUSCO_dataset_file, ",", options="")
    return (busco_data)
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""

    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call: use mummer (-m) and force results folder (-f)
    cmd_call = "%s -i %s -m -f > %s 2> %s " % (agrvate_bin, assembly_file, log_call, err_call)
    status = HCGB_sys.system_call(cmd_call)

    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()

    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")

        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
                  os.path.join(results_folder, 'error_report.tab'))

        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter')  ## open excel handle

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample

        ## summary tab columns:
        ## agr_group: gp1/gp2/gp3/gp4; 'u' means unknown. If multiple agr groups
        ##            were found (multiple_agr = m), the displayed agr group is the
        ##            majority/highest confidence one.
        ## match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        ## canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        ## multiple_agr: s means single, m means multiple, u means unknown.
        ##               Multiple groups are likely found due to multiple S. aureus
        ##               isolates in the sequence.
        ## frameshifts: number found in CDS of extracted agr operon ('u' if agr operon not extracted)

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel: tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary')

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample

            ## agr_gp tab columns:
            ## contig: assembly contig ID
            ## agr: ID of matched agr group kmer
            ## evalue: evalue of the match
            ## identity: percentage identity of match
            ## start: start position of kmer alignment on input sequence
            ## end: end position of kmer alignment on input sequence

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)

            ## save agr_gp_tab file into excel: tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon')

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)
            results['operon_fna'] = agr_operon_fna_file
        except IndexError:  ## no operon fasta extracted
            results['operon_fna'] = ''

        ## error report
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report = HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)

        ## save error_report file into excel: tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps')

        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save()  ## close excel handle

        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
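## Illustrative usage sketch: agrvate_call() derives the AgrVATE results folder
## name from the assembly file, so the file is expected to end in '.fna'.
## Paths below are hypothetical.
def _example_agrvate_call():
    results = agrvate_call('sample1', '/data/assemblies/sample1.fna',
                           '/analysis/agr/sample1', debug=False)
    ## one-row dataframe: agr group, match score, operon fasta and excel paths
    print(results)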
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):

    ## folder for results
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')

    filename_stamp_all = path + '/.success_all'
    if os.path.isfile(filename_stamp_all):
        stamp = time_functions.read_time_stamp(filename_stamp_all)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'RNAbiotype'), 'yellow'))
        return ()
    else:
        filename_stamp_featureCounts = path + '/.success_featureCounts'
        if os.path.isfile(filename_stamp_featureCounts):
            stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
            print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'featureCounts'), 'yellow'))
        else:
            ## debugging messages
            if Debug:
                print("** DEBUG:")
                print("featureCounts system call for sample: " + name)
                print("out_file: " + out_file)
                print("logfile: " + logfile)

            ## send command for feature count
            if allow_multimap:
                ## allow multimapping (-M) and counting reads on all overlapping features (-O)
                cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' % (
                    featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))
            else:
                cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' % (
                    featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))

            ## system call
            cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)
            if not cmd_featureCount_code:
                print("** ERROR: featureCount failed for sample " + name)
                exit()

            ## print time stamp
            time_functions.print_time_stamp(filename_stamp_featureCounts)

        ## parse results
        (extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)

        ## debugging messages
        if Debug:
            print("** DEBUG:")
            print("extended_Stats: " + extended_Stats_file)
            print(main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
            print("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
            print(main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

    return ()
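## Illustrative usage sketch: featureCounts-based biotype quantification for one
## BAM file. Paths and binary location are hypothetical; 'stranded' follows the
## featureCounts -s convention (0 unstranded, 1 stranded, 2 reversely stranded).
def _example_biotype_all():
    biotype_all('/usr/local/bin/featureCounts', '/analysis/biotype/sample1',
                '/data/annotation.gtf', '/data/sample1.bam', 'sample1',
                threads=4, Debug=False, allow_multimap=True, stranded=0)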
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):

    ## check if previously plotted
    filename_stamp_plot = folder + '/.success_plot'
    if os.path.isfile(filename_stamp_plot):
        stamp = time_functions.read_time_stamp(filename_stamp_plot)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'plot results'), 'yellow'))
    else:
        ## plot and show results
        RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')

        ## create plot
        plt.figure(figsize=(16, 8))
        df_genetype_2 = pd.DataFrame({'Type': RNAbiotypes_stats[0],
                                      'Count': RNAbiotypes_stats[1]}).sort_values(by=['Count'])

        ## get total count
        df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()

        ## filter out classes below 1% of the total count
        minimun = df_genetype_ReadCount_sum * 0.01
        df_genetype_filter_greater = df_genetype_2[df_genetype_2['Count'] >= minimun]
        df_genetype_filter_smaller = df_genetype_2[df_genetype_2['Count'] < minimun]

        ## create % values
        df_genetype_2['Percentage'] = (df_genetype_2['Count'] / df_genetype_ReadCount_sum * 100).round(3)

        ## merge the filtered-out classes into a single 'Other' class
        df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum()
        df_genetype_filter_greater2 = df_genetype_filter_greater.append(
            {'Count': df_genetype_filter_smaller_sum, 'Type': 'Other'}, ignore_index=True)

        ## create pie plot
        ax1 = plt.subplot(121, aspect='equal')
        df_genetype_filter_greater2.plot.pie(
            y='Count', ax=ax1, autopct='%1.2f%%', shadow=False,
            labels=df_genetype_filter_greater2['Type'], legend=False)

        ## plot table
        ax2 = plt.subplot(122)
        plt.axis('off')
        tbl = ax2.table(cellText=df_genetype_2.values,
                        colLabels=df_genetype_2.columns,
                        loc='center', rowLoc='left', cellLoc='center')
        tbl.auto_set_font_size(True)
        #tbl.set_fontsize(12)
        tbl.scale(1.1, 1.1)

        ## set PDF name and generate image
        name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')
        plt.savefig(name_figure)
        plt.close()  ## close current figure; plt.close(name_figure) would look up a figure *named* after the file

        ## print time stamps
        time_functions.print_time_stamp(filename_stamp_plot)
        filename_stamp_all = folder + '/.success_all'
        time_functions.print_time_stamp(filename_stamp_all)
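## Illustrative usage sketch: plots the biotype counts produced by
## parse_featureCount() above; the stats file path is hypothetical (two columns,
## no header: biotype and read count).
def _example_pie_plot_results():
    pie_plot_results('/analysis/biotype/sample1/sample1_RNAbiotype.tsv',
                     'sample1', '/analysis/biotype/sample1', Debug=False)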
def NCBI_DB(strains2get, data_folder, Debug):
    """Downloads given taxa from NCBI if not available and updates database information.

    This function checks in the given folder whether the strain of interest is
    available. If not, it connects to NCBI using the python module
    ncbi_genome_download and downloads the required information.

    :param strains2get: dataframe containing genus, species and NCBI assembly columns among others. See example below.
    :param data_folder: Absolute path to database NCBI folder.
    :param Debug: Print messages for debugging purposes if desired.
    :type strains2get: dataframe
    :type data_folder: string
    :type Debug: bool

    :return: Dataframe of genbank database updated for all available entries.

    Columns for the dataframe :file:`strains2get` consist of:

    sample,genus,species,strain,BioSample,genome,Plasmids

    See an example in file :file:`/devel/results/strains2get_NCBI_DB.csv`, shown here:

    .. include:: ../../devel/results/strains2get_NCBI_DB.csv
        :literal:

    See an example of the return dataframe, containing the updated database
    information, in file :file:`/devel/results/genbank_database.csv` here:

    .. include:: ../../devel/results/genbank_database.csv
        :literal:

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_folder`

        - :func:`HCGB.functions.main_functions.get_data`

        - :func:`BacterialTyper.scripts.database_generator.get_dbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.NCBIdownload`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`

    .. include:: ../../links.inc
    """

    ## set index
    strains2get = strains2get.set_index('NCBI_assembly_ID', drop=False)  ## set new index but keep column
    strains2get.index.names = ['ID']  ## rename index
    strains2get = strains2get.drop_duplicates()

    #########
    if Debug:
        print(colored("DEBUG: NCBI data provided: ", 'yellow'))
        print(strains2get)

    ## get data existing database
    print("+ Create the database in folder: \n", data_folder)
    HCGB_files.create_folder(data_folder)

    ## read database
    db_frame = getdbs('NCBI', data_folder, 'genbank', Debug)
    database_df = get_database(db_frame, Debug)

    #########
    if Debug:
        print(colored("DEBUG: NCBI genbank database retrieved: ", 'yellow'))
        print("db_frame")
        print(db_frame)
        print()
        print("database_df")
        print(database_df)

    ## loop and download
    for index, row in strains2get.iterrows():
        HCGB_aes.print_sepLine("+", 75, False)
        acc_ID = index  #strains2get.loc[index]['NCBI_assembly_ID']
        info = "Genus: " + strains2get.loc[index]['genus'] + '\n' + \
               "Species: " + strains2get.loc[index]['species'] + '\n' + \
               "Strain: " + strains2get.loc[index]['name'] + '\n' + \
               "ID accession: " + acc_ID + '\n'

        ## module ngd requires to download data in the bacteria subfolder under the genbank folder
        dir_path = data_folder + '/genbank/bacteria/' + acc_ID

        ## check if already exists
        if acc_ID in database_df.index:
            print("\n+ Data is already available in database for: ")
            print(colored(info, 'green'))
        else:
            ## download
            print("\n+ Downloading data for:")
            print(colored(info, 'green'))
            data_accID = NCBIdownload(acc_ID, strains2get, data_folder)
            this_db = HCGB_main.get_data(data_accID, ',', 'index_col=0')
            this_db = this_db.set_index('ID')
            database_df = database_df.append(this_db)

    ## Generate/Update database
    database_csv = data_folder + '/genbank_database.csv'
    db_updated = update_db_data_file(database_df, database_csv)
    print("+ Database has been generated in file: ", database_csv)
    return (db_updated)
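## Illustrative sketch of the dataframe NCBI_DB() expects; the accession, strain
## and folder below are hypothetical, and additional columns are allowed.
def _example_NCBI_DB():
    import pandas as pd
    strains2get = pd.DataFrame({'genus': ['Staphylococcus'],
                                'species': ['aureus'],
                                'name': ['NCTC8325'],
                                'NCBI_assembly_ID': ['GCF_000013425.1']})
    return NCBI_DB(strains2get, '/data/NCBI', Debug=False)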
def run_search(arg_dict):
    """Main function of the search module in the BacDup package.

    This module searches for and creates the gene duplication analysis. The user
    can provide either a previously parsed data project (NCBI GenBank IDs,
    taxonomy or user annotation data) or single/multiple samples.
    """

    ## help messages
    if (arg_dict.input_help):
        help_input()
        exit()
    if (arg_dict.blast_help):
        info.blast_help()
        exit()
    if (arg_dict.project_help):
        info.project_help()
        exit()
    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t |-- GenBank format files')
    print('\t |-- GFF files + Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: ****
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)
    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)
    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)
    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                ## save results as a .csv file
                sort_csv = os.path.abspath(os.path.join(dict_dup_folders[sample], 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored("\t+ Filter results already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)
        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored("\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data
        #data2add_entry = data2add_entry.reset_index()
        data2add = data2add.append(data2add_entry, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
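## Illustrative sketch (assumption-heavy): run_search() consumes an argparse-style
## namespace. Only the attributes referenced above are set here; a real run also
## needs whatever parse_search_options() expects, so treat this as a skeleton.
def _example_run_search():
    import argparse
    arg_dict = argparse.Namespace(
        input_help=False, blast_help=False, project_help=False,
        detached_mode_help=False, detached=False, project=True,
        input_folder='/analysis/BacDup_project', debug=False,
        pident=85, evalue=1e-05, percentage=80, bitscore=50)
    run_search(arg_dict)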
def Saureus_specific(samples_df, samples_info, options, folder, outdir_dict):
    """
    Retrieves S. aureus specific information.

    See additional information in :doc:`../../user_guide/report/Saureus/saureus_report`
    """

    ########################################
    ## get European Quality Control genes
    ########################################
    Staphylococcus_path = os.path.abspath(os.path.join(
        os.path.realpath(__file__), '..', '..', 'report', 'Staphylococcus'))
    EQC_genes = os.path.join(Staphylococcus_path, "EQC_genes.csv")
    arcA_gene = os.path.join(Staphylococcus_path, "arcA.fasta")

    ## EQC_genes.csv contains entries such as:
    ## Gene,ID,Source
    ## mecA,ARO:3000617,CARD
    ## mecC,ARO:3001209,CARD
    ## mupA,ARO:3000521,CARD
    EQC_genes_df = HCGB_main.get_data(EQC_genes, ',', '')

    ## debugging messages
    if options.debug:
        HCGB_aes.debug_message("Saureus_specific", 'yellow')
        print(Staphylococcus_path)
        print(arcA_gene)
        HCGB_aes.debug_message("EQC_genes", 'yellow')
        print(EQC_genes)
        print(EQC_genes_df)

    ####################
    ## get gene info by unique ID
    ####################
    ## get gene names
    gene_IDs = EQC_genes_df['ID'].to_list()

    results_Profiles_ids = retrieve_genes.get_genes_profile(samples_info, gene_IDs, options.debug, 'ID')
    if options.debug:
        HCGB_aes.debug_message("results_Profiles_ids", 'yellow')
        print(results_Profiles_ids)

    ########################################
    ## add additional genes if required
    ########################################
    if options.genes_ids_profile:
        in_file = os.path.abspath(options.genes_ids_profile)
        gene_names = [line.rstrip('\n') for line in open(in_file)]
        if options.debug:
            print("gene_names")
            print(gene_names)

        results_Profiles_names = retrieve_genes.get_genes_profile(samples_info, gene_names, options.debug, 'name')
        if options.debug:
            print("results_Profiles_names")
            print(results_Profiles_names)

    #################################
    ## get blast sequence         ##
    #################################
    # arcA_gene

    ####################
    ## get spatyping  ##
    ####################
    samples_df = samples_df.set_index('name')
    assembly_files = samples_df.loc[samples_df['tag'] == "assembly", "sample"]
    results_spaType = pd.DataFrame()
    #results_spaType = get_spa_typing.module_call(options.database, assembly_files.to_dict(), options.debug)

    ####################
    ## get agr typing
    ####################
    agr_results = agr_typing.agrvate_caller(assembly_files.to_dict(), outdir_dict, options.debug)
    ## copy excel file and operon into report folder
    ## remove from dataframe

    ####################
    ## get sccmec
    ####################
    ## todo

    ####################
    ## save results
    ####################
    ## open excel writer
    name_excel = folder + '/Saureus_report.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    ## results_Profiles ids
    results_Profiles_ids.to_excel(writer, sheet_name="gene_ids")

    ## results_Profiles names
    if options.genes_ids_profile:
        results_Profiles_names.to_excel(writer, sheet_name="gene_names")

    ## results_spaType
    results_spaType.to_excel(writer, sheet_name="spaTyper")

    ## agr_results
    agr_results.to_excel(writer, sheet_name="agr typing")

    ## close excel writer
    writer.save()
def parse_card(folder, sampleName, fileResults, fileFlags, summary, assembly_cutoff, card_trick_info):
    ##
    ## Parses results from the CARD database.
    ## Input is a folder for output results, the sample name, and:
    ## - fileResults: report.tsv generated by ariba
    ## - fileFlags: generated by results_parser calling ariba expandflag
    ## - summary: the summary file generated by ariba summary
    ##

    ## get data
    summary_data = pd.read_csv(summary, header=0, sep=',')        ## report_summary.csv :: parsed information from ARIBA
    fileFlags_data = pd.read_csv(fileFlags, header=0, sep='\t')   ## flags_explain.tsv :: ariba expandflag: explained flags
    original_data = pd.read_csv(fileResults, header=0, sep='\t')  ## report.tsv :: ariba report generated
    card_ontology = HCGB_main.get_data(card_trick_info + '/aro.obo.csv', ',', 'index_col=0')  ## read card_info generated for card_trick parse

    ## summary data
    summary_data = summary_data.set_index('name')
    list_found_genes = summary_data.columns
    cluster_len = len(list_found_genes)

    ## print info
    print("\tCheck CARD result: ", sampleName)
    print("\t%s genes putatively involved in resistance to some antibiotics..." % cluster_len)

    ## subset
    data = original_data.loc[original_data['cluster'].isin(summary_data.columns)]

    ############################################################################
    ## analyze each cluster conferring resistance
    ############################################################################
    colnames = ['Reference', 'ID', 'Protein-coding', 'Presence/Absence',
                'Variants', 'Description', 'Additional information']

    ## get results: conferring resistance
    df_results = found_results(colnames, data, list_found_genes, 'CARD')

    ## get results: found, identified, partial
    df_identified = identified_results(original_data, "CARD", list_found_genes, assembly_cutoff)

    ############################################################################
    ## use the card-trick python package to get the ontology for each term
    AROS_identified = list(df_identified['ID'])
    information_ontology = card_trick_caller.get_info_CARD(AROS_identified, 'ARO', card_ontology)

    ##########################
    ## generate excel sheet
    ##########################
    ## open excel handle
    name_excel = folder + '/' + sampleName + '_CARD_results.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    ## write excel tabs
    df_results.to_excel(writer, sheet_name='results')                  ## results conferring resistance
    df_identified.to_excel(writer, sheet_name='identified')            ## identified genes: ARIBA flags explained
    information_ontology.to_excel(writer, sheet_name='CARD_ontology')  ## CARD ontology
    original_data.to_excel(writer, sheet_name='ARIBA_report')          ## original data from ARIBA
    summary_data.to_excel(writer, sheet_name='ARIBA_summary')          ## ARIBA summary generated
    fileFlags_data.to_excel(writer, sheet_name='flags')                ## ARIBA flags explained

    name_csv = folder + '/' + sampleName + '_CARD_summary.csv'
    df_identified.to_csv(name_csv)

    ## close excel handle
    writer.save()

    return (name_excel, name_csv)
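## Illustrative usage sketch: the three input files are the standard ARIBA outputs
## for a CARD run; all paths are hypothetical, and card_trick_info points to the
## folder holding aro.obo.csv.
def _example_parse_card():
    return parse_card('/analysis/card/sample1', 'sample1',
                      '/analysis/card/sample1/report.tsv',
                      '/analysis/card/sample1/flags_explain.tsv',
                      '/analysis/card/sample1/report_summary.csv',
                      assembly_cutoff=0.9, card_trick_info='/data/card_trick')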
def update_database_user_data(database_folder, project_folder, Debug, options):
    """
    Updates the user_data folder within the database folder provided.

    It generates a single subfolder for each sample previously analyzed and
    stores the main information and result files for later interpretation,
    comparison and/or summarization with newly analyzed samples.

    :param database_folder: Absolute path to database folder.
    :param project_folder: Absolute path to project folder.
    :param Debug: True/False for debugging messages.
    :param options: Options passed from the command line.

    :type database_folder: string
    :type project_folder: string
    :type Debug: boolean
    :type options: argparse.Namespace

    :returns: Updated database result from :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
    :rtype: Dataframe

    :warnings: Returns **FAIL** if check process failed.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_subfolder`

        - :func:`HCGB.functions.main_functions.get_data`

        - :func:`HCGB.functions.main_functions.optimize_threads`

        - :func:`BacterialTyper.scripts.database_user.get_userData_files`

        - :func:`BacterialTyper.scripts.database_user.update_sample`

        - :func:`BacterialTyper.scripts.database_generator.getdbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
    """

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data
    project_all_data = pd.concat([project_data_df, project_info_df], join='outer', sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)
        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)
        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder, 'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' % (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## send a job for each sample
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data, user_data_db, Debug): name
            for name, cluster in sample_frame
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    for name, cluster in sample_frame:
        ###### dump to file
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            ## concatenating by outer we get all available entries
            user_data_db = pd.concat([user_data_db, dataGot], join='outer', sort=True).drop_duplicates()

            if (options.debug):
                print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
                print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'
    dataUpdated = database_generator.update_db_data_file(user_data_db, database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)
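## Illustrative usage sketch: 'options' must carry the attributes used above
## (threads, single_end, debug, ...); the database and project paths are
## hypothetical.
def _example_update_database_user_data(options):
    return update_database_user_data('/data/BacterialTyper_db',
                                     '/analysis/my_project', False, options)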
def get_R_packages():
    dep_file = os.path.abspath(os.path.join(os.path.dirname(__file__), 'R', 'R_dependencies.csv'))
    dep_file_data = HCGB_main.get_data(dep_file, ',', 'index_col=0')
    return (dep_file_data)
def check_perl_packages(file_name, Debug, option_install, install_path):
    """
    Check the perl packages required.

    This function checks whether the packages installed in the system fulfil the
    minimum version specified in the configuration file. Details of the perl
    packages required are available in :file:`BacterialTyper/config/perl/`.

    It uses function :func:`BacterialTyper.config.set_config.get_perl_packages` to
    retrieve the version of the perl packages installed in the system. Then it uses
    :func:`BacterialTyper.config.extern_progs.min_perl_package_version` to retrieve
    the minimum version specified. It compares them using function
    :func:`BacterialTyper.config.set_config.check_install_module`.

    :param file_name: Name of the file to search within :file:`BacterialTyper/config/perl/`.
    :param Debug: True/False for debugging messages.
    :param option_install: True/False for installing missing dependencies.
    :param install_path: Install path for installing modules.

    :type file_name: string
    :type Debug: boolean
    :type option_install: boolean
    :type install_path: string

    :returns: Print messages if packages are installed.

    .. seealso:: This function relies on other ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.get_perl_packages`

        - :func:`BacterialTyper.config.set_config.check_install_module`

        - :func:`BacterialTyper.config.extern_progs.min_perl_package_version`

        - :func:`BacterialTyper.config.install_dependencies.perl_package_install`
    """

    ## get perl packages installed
    my_packages_installed = get_perl_packages(Debug, file_name)

    ## debug messages
    if (Debug):
        print("my_packages_installed :: ")
        print(my_packages_installed)

    ## min versions for packages
    my_packages_requirements = extern_progs.min_perl_package_version(file_name)

    ## debug messages
    if (Debug):
        print("my_packages_requirements")
        print(my_packages_requirements)

    ## get info for perl modules
    perl_lib_dependecies_file = extern_progs.file_list(file_name)
    perl_lib_dependecies = HCGB_main.get_data(perl_lib_dependecies_file, ',', 'index_col=0')

    ## check each package
    for each in my_packages_requirements:
        ## get min version
        min_version = my_packages_requirements[each]

        ## get version installed in system
        installed = my_packages_installed[each]

        ## module name conversion
        module_name = perl_lib_dependecies.loc[each, 'module']

        ## debug messages
        if (Debug):
            print("Module:", each)
            print("Module name:", module_name)
            print("Min_Version:", min_version)
            print("Version installed:", installed)

        ## check if installed
        message = check_install_module(installed, module_name, min_version, 'Package')
        if (message == 'OK'):
            continue
        else:
            print(colored("** ATTENTION: Installation of perl modules is not supported", 'red'))
            print("+ Please install manually package: ", module_name, " to continue with BacterialTyper\n\n")
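## Illustrative usage sketch (assumption: 'perl_dependencies' is the configuration
## file key; installation of missing modules is not supported, so option_install
## and install_path are effectively informational here).
def _example_check_perl_packages():
    check_perl_packages('perl_dependencies', Debug=False,
                        option_install=False, install_path=None)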
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()

    ######################################################
    ## create folder
    ## absolute path
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is a file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included in a kma index

        ## Get all entries belonging to the taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check descendant NCBI taxonomy ids provided ---------\n")
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA', genbank_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(colored("DEBUG: User provides folder containing project", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(options.path, abs_project_folder, Debug, options)
        else:
            print(colored("ERROR: Folder provided does not exist: %s" % options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA', user_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))
    else:
        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path, Debug, options.threads)

        ### ariba user databases
        if (options.ariba_users_fasta):
            print("+ Generate ARIBA database for databases provided: prepare fasta and metadata information")

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from the KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print("+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website")

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()
    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()