def get_database(db_frame, Debug):
    data4db = pd.DataFrame()
    for index, row in db_frame.iterrows():
        ## information
        this_file = db_frame.loc[index]['path'] + '/info.txt'
        if os.path.isfile(this_file):
            print('+ Reading information for sample: ', db_frame.loc[index]['db'])
            print(colored("\t+ Obtaining information from file: %s" % this_file, 'yellow'))
            this_db = HCGB_main.get_data(this_file, ',', 'index_col=0')
            data4db = data4db.append(this_db)

        timestamp = db_frame.loc[index]['path'] + '/.success'
        if os.path.isfile(timestamp):
            stamp = HCGB_time.read_time_stamp(timestamp)
            print(colored("\t+ Data generated on: %s" % stamp, 'yellow'))

        HCGB_aes.print_sepLine("*", 25, False)

    ## index by ID
    if not data4db.empty:
        data4db = data4db.set_index('ID')

    return (data4db)
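## Illustrative usage sketch (not part of the original module): how get_database()
## is typically fed. The folder layout below is hypothetical; the function only
## needs a dataframe with 'path' and 'db' columns, where each path may contain an
## 'info.txt' CSV and an optional '.success' timestamp file.
def _example_get_database():
    import pandas as pd
    db_frame = pd.DataFrame({
        'db': ['sample1', 'sample2'],
        'path': ['/data/NCBI/genbank/bacteria/acc_1',
                 '/data/NCBI/genbank/bacteria/acc_2']})
    ## returns an empty dataframe if no info.txt files are found
    return get_database(db_frame, Debug=False)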
def get_perl_packages(Debug, file_name):
    """
    Retrieves the version of the perl packages installed in the system.

    It retrieves the dependencies name conversion from file
    :file:`BacterialTyper/config/perl/perl_dependencies.csv` using function
    :func:`BacterialTyper.config.extern_progs.file_list` and
    :func:`BacterialTyper.scripts.functions.get_data`.

    For each module it retrieves the package version installed in the system
    using :func:`BacterialTyper.config.set_config.check_perl_package_version`.

    :returns: Dictionary containing for each perl module (key) the installed version (value).

    .. seealso:: This function relies on other ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.check_perl_package_version`

        - :func:`BacterialTyper.config.extern_progs.file_list`

        - :func:`BacterialTyper.scripts.functions.get_data`
    """
    ## get info for perl modules
    perl_lib_dependecies_file = extern_progs.file_list(file_name)
    perl_lib_dependecies = HCGB_main.get_data(perl_lib_dependecies_file, ',', 'index_col=0')

    my_packages_installed = {}
    for index_name, row in perl_lib_dependecies.iterrows():
        module_name = row['module']
        installed = check_perl_package_version(module_name, Debug)  ## check version installed in system
        if not (installed):
            installed = 'n.a.'
        my_packages_installed[index_name] = installed

    return (my_packages_installed)
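## Illustrative usage sketch (assumption: 'perl_dependencies' is the file-list key
## registered in extern_progs; adjust if the configuration uses another name).
def _example_get_perl_packages():
    installed = get_perl_packages(Debug=False, file_name='perl_dependencies')
    for module, version in installed.items():
        print(module, version)  ## version is 'n.a.' when the module is missing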
def get_info_software():
    """Read software information.

    Reads the information stored in file
    :file:`BacterialTyper/config/software/software_details.csv` and returns a
    pandas dataframe.
    """
    info_file = os.path.abspath(os.path.join(os.path.dirname(__file__), 'software', 'software_details.csv'))
    return (HCGB_main.get_data(info_file, ',', 'index_col=0'))
def update_db_data_file(data, csv):
    if os.path.isfile(csv):
        print("\n+ Updating database")
        print("+ Obtaining information from database file: %s" % csv)
        db2update = HCGB_main.get_data(csv, ',', 'index_col=0')

        ## TODO: provide preference to db2update
        df = pd.concat([db2update, data], join='inner', sort=True).drop_duplicates()
        df.to_csv(csv)
        return (df)
    else:
        data.to_csv(csv)
        return (data)
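## Illustrative sketch of the update semantics: new entries are concatenated with
## the existing CSV (if any) and de-duplicated, so re-running is idempotent.
## The path and accession below are hypothetical.
def _example_update_db_data_file():
    import pandas as pd
    new_entries = pd.DataFrame({'genus': ['Staphylococcus'],
                                'species': ['aureus'],
                                'genome': ['/data/GCF_000013425.1_genomic.fna']},
                               index=['GCF_000013425.1'])
    return update_db_data_file(new_entries, '/data/genbank_database.csv')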
def retrieve_genes_ids_profile(profile, gene_ID, debug, option):
    """
    Retrieves, from a gene profile CSV, the rows matching a given gene.

    With option='name' genes are matched by name prefix (regex); with
    option='ID' an exact identifier match is required.
    """
    ## read data
    get_csv_data = HCGB_main.get_data(profile, ',', '')

    if option == 'name':
        list_Genes = get_csv_data['Genes'].to_list()
        get_csv_data.index = get_csv_data['Genes']
    elif option == 'ID':
        list_Genes = get_csv_data['ID'].to_list()
        get_csv_data.index = get_csv_data['ID']

    ## debug messages
    if debug:
        HCGB_aes.debug_message('profile: ' + profile, 'yellow')
        HCGB_aes.debug_message('gene_id: ' + str(gene_ID), 'yellow')
        HCGB_aes.debug_message('data: ', 'yellow')
        print(get_csv_data)
        HCGB_aes.debug_message('Option: ' + option, 'yellow')
        HCGB_aes.debug_message('Genes: ', 'yellow')
        print(list_Genes)

    ## search accordingly
    if option == 'name':
        regex_search = re.compile("^" + gene_ID + ".*")
        filtered_genes = list(filter(regex_search.match, list_Genes))

        ## debug messages
        if debug:
            HCGB_aes.debug_message('filtered_genes: ', 'yellow')
            print(filtered_genes)
            HCGB_aes.debug_message('get_csv_data.loc[filtered_genes]: ', 'yellow')
            print(get_csv_data.loc[filtered_genes])

        return (get_csv_data.loc[filtered_genes])
    else:
        if gene_ID in list_Genes:
            ## debug messages
            if debug:
                HCGB_aes.debug_message('gene_id: ' + gene_ID, 'yellow')
                print(get_csv_data.loc[gene_ID].to_frame().transpose())

            return (get_csv_data.loc[gene_ID].to_frame().transpose())
        else:
            return (pd.DataFrame())
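## Illustrative usage sketch: option='name' matches gene names by prefix (regex),
## option='ID' requires an exact identifier. The profile CSV path is hypothetical;
## the file must contain 'Genes' and/or 'ID' columns.
def _example_retrieve_genes_ids_profile():
    ## all rows whose gene name starts with 'mecA'
    by_name = retrieve_genes_ids_profile('/data/profile.csv', 'mecA', debug=False, option='name')
    ## the single row with this exact ID (empty dataframe if absent)
    by_id = retrieve_genes_ids_profile('/data/profile.csv', 'ARO:3000617', debug=False, option='ID')
    return (by_name, by_id)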
def busco_datasets():
    """BUSCO dataset information

    :return: List containing information for each dataset available in file
             BUSCO_dataset.csv under the data directory.

    .. seealso:: Additional information on BUSCO datasets available.

        - :doc:`BUSCO datasets <../../../data/BUSCO_datasets>`

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.data.data_files.data_list`

        - :func:`BacterialTyper.scripts.functions.file2dataframe`
    """
    ## read from file: BUSCO_dataset.csv
    BUSCO_dataset_file = data.data_files.data_list("BUSCO_dataset")
    busco_data = HCGB_main.get_data(BUSCO_dataset_file, ",", options="")
    return (busco_data)
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""

    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call: use mummer (-m) and force results folder (-f)
    cmd_call = "%s -i %s -m -f > %s 2> %s " % (agrvate_bin, assembly_file, log_call, err_call)
    status = HCGB_sys.system_call(cmd_call)

    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()

    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")

        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
                  os.path.join(results_folder, 'error_report.tab'))

        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter')  ## open excel handle

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample

        ## summary tab columns:
        ## agr_group: gp1/gp2/gp3/gp4; 'u' means unknown. If multiple agr groups
        ##            were found (multiple_agr = m), the displayed agr group is the
        ##            majority/highest confidence one.
        ## match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        ## canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        ## multiple_agr: s means single, m means multiple, u means unknown.
        ##               Multiple groups are likely found due to multiple S. aureus
        ##               isolates in the sequence.
        ## frameshifts: number found in CDS of extracted agr operon ('u' if agr operon not extracted)

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel: tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary')

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample

            ## agr_gp tab columns:
            ## contig: assembly contig ID
            ## agr: ID of matched agr group kmer
            ## evalue: evalue of the match
            ## identity: percentage identity of match
            ## start: start position of kmer alignment on input sequence
            ## end: end position of kmer alignment on input sequence

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)

            ## save agr_gp_tab file into excel: tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon')

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)
            results['operon_fna'] = agr_operon_fna_file
        except IndexError:  ## no operon fasta extracted
            results['operon_fna'] = ''

        ## error report
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report = HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)

        ## save error_report file into excel: tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps')

        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save()  ## close excel handle

        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
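## Illustrative usage sketch: agrvate_call() derives the AgrVATE results folder
## name from the assembly file, so the file is expected to end in '.fna'.
## Paths below are hypothetical.
def _example_agrvate_call():
    results = agrvate_call('sample1', '/data/assemblies/sample1.fna',
                           '/analysis/agr/sample1', debug=False)
    ## one-row dataframe: agr group, match score, operon fasta and excel paths
    print(results)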
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):

    ## folder for results
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')

    filename_stamp_all = path + '/.success_all'
    if os.path.isfile(filename_stamp_all):
        stamp = time_functions.read_time_stamp(filename_stamp_all)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'RNAbiotype'), 'yellow'))
        return ()
    else:
        filename_stamp_featureCounts = path + '/.success_featureCounts'
        if os.path.isfile(filename_stamp_featureCounts):
            stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
            print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'featureCounts'), 'yellow'))
        else:
            ## debugging messages
            if Debug:
                print("** DEBUG:")
                print("featureCounts system call for sample: " + name)
                print("out_file: " + out_file)
                print("logfile: " + logfile)

            ## send command for feature count
            if allow_multimap:
                ## allow multimapping (-M) and counting reads on all overlapping features (-O)
                cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' % (
                    featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))
            else:
                cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' % (
                    featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile))

            ## system call
            cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)
            if not cmd_featureCount_code:
                print("** ERROR: featureCount failed for sample " + name)
                exit()

            ## print time stamp
            time_functions.print_time_stamp(filename_stamp_featureCounts)

        ## parse results
        (extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)

        ## debugging messages
        if Debug:
            print("** DEBUG:")
            print("extended_Stats: " + extended_Stats_file)
            print(main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
            print("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
            print(main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

    return ()
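## Illustrative usage sketch: featureCounts-based biotype quantification for one
## BAM file. Paths and binary location are hypothetical; 'stranded' follows the
## featureCounts -s convention (0 unstranded, 1 stranded, 2 reversely stranded).
def _example_biotype_all():
    biotype_all('/usr/local/bin/featureCounts', '/analysis/biotype/sample1',
                '/data/annotation.gtf', '/data/sample1.bam', 'sample1',
                threads=4, Debug=False, allow_multimap=True, stranded=0)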
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):

    ## check if previously plotted
    filename_stamp_plot = folder + '/.success_plot'
    if os.path.isfile(filename_stamp_plot):
        stamp = time_functions.read_time_stamp(filename_stamp_plot)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'plot results'), 'yellow'))
    else:
        ## plot and show results
        RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')

        ## create plot
        plt.figure(figsize=(16, 8))
        df_genetype_2 = pd.DataFrame({'Type': RNAbiotypes_stats[0],
                                      'Count': RNAbiotypes_stats[1]}).sort_values(by=['Count'])

        ## get total count
        df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()

        ## filter out classes below 1% of the total count
        minimun = df_genetype_ReadCount_sum * 0.01
        df_genetype_filter_greater = df_genetype_2[df_genetype_2['Count'] >= minimun]
        df_genetype_filter_smaller = df_genetype_2[df_genetype_2['Count'] < minimun]

        ## create % values
        df_genetype_2['Percentage'] = (df_genetype_2['Count'] / df_genetype_ReadCount_sum * 100).round(3)

        ## merge the filtered-out classes into a single 'Other' class
        df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum()
        df_genetype_filter_greater2 = df_genetype_filter_greater.append(
            {'Count': df_genetype_filter_smaller_sum, 'Type': 'Other'}, ignore_index=True)

        ## create pie plot
        ax1 = plt.subplot(121, aspect='equal')
        df_genetype_filter_greater2.plot.pie(
            y='Count', ax=ax1, autopct='%1.2f%%', shadow=False,
            labels=df_genetype_filter_greater2['Type'], legend=False)

        ## plot table
        ax2 = plt.subplot(122)
        plt.axis('off')
        tbl = ax2.table(cellText=df_genetype_2.values,
                        colLabels=df_genetype_2.columns,
                        loc='center', rowLoc='left', cellLoc='center')
        tbl.auto_set_font_size(True)
        #tbl.set_fontsize(12)
        tbl.scale(1.1, 1.1)

        ## set PDF name and generate image
        name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')
        plt.savefig(name_figure)
        plt.close()  ## close current figure; plt.close(name_figure) would look up a figure *named* after the file

        ## print time stamps
        time_functions.print_time_stamp(filename_stamp_plot)
        filename_stamp_all = folder + '/.success_all'
        time_functions.print_time_stamp(filename_stamp_all)
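## Illustrative usage sketch: plots the biotype counts produced by
## parse_featureCount() above; the stats file path is hypothetical (two columns,
## no header: biotype and read count).
def _example_pie_plot_results():
    pie_plot_results('/analysis/biotype/sample1/sample1_RNAbiotype.tsv',
                     'sample1', '/analysis/biotype/sample1', Debug=False)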
def NCBI_DB(strains2get, data_folder, Debug):
    """Downloads given taxa from NCBI if not available and updates database information.

    This function checks in the given folder whether the strain of interest is
    available. If not, it connects to NCBI using the python module
    ncbi_genome_download and downloads the required information.

    :param strains2get: dataframe containing genus, species and NCBI assembly columns among others. See example below.
    :param data_folder: Absolute path to database NCBI folder.
    :param Debug: Print messages for debugging purposes if desired.
    :type strains2get: dataframe
    :type data_folder: string
    :type Debug: bool

    :return: Dataframe of genbank database updated for all available entries.

    Columns for the dataframe :file:`strains2get` consist of:

    sample,genus,species,strain,BioSample,genome,Plasmids

    See an example in file :file:`/devel/results/strains2get_NCBI_DB.csv`, shown here:

    .. include:: ../../devel/results/strains2get_NCBI_DB.csv
        :literal:

    See an example of the return dataframe, containing the updated database
    information, in file :file:`/devel/results/genbank_database.csv` here:

    .. include:: ../../devel/results/genbank_database.csv
        :literal:

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_folder`

        - :func:`HCGB.functions.main_functions.get_data`

        - :func:`BacterialTyper.scripts.database_generator.get_dbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.NCBIdownload`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`

    .. include:: ../../links.inc
    """

    ## set index
    strains2get = strains2get.set_index('NCBI_assembly_ID', drop=False)  ## set new index but keep column
    strains2get.index.names = ['ID']  ## rename index
    strains2get = strains2get.drop_duplicates()

    #########
    if Debug:
        print(colored("DEBUG: NCBI data provided: ", 'yellow'))
        print(strains2get)

    ## get data existing database
    print("+ Create the database in folder: \n", data_folder)
    HCGB_files.create_folder(data_folder)

    ## read database
    db_frame = getdbs('NCBI', data_folder, 'genbank', Debug)
    database_df = get_database(db_frame, Debug)

    #########
    if Debug:
        print(colored("DEBUG: NCBI genbank database retrieved: ", 'yellow'))
        print("db_frame")
        print(db_frame)
        print()
        print("database_df")
        print(database_df)

    ## loop and download
    for index, row in strains2get.iterrows():
        HCGB_aes.print_sepLine("+", 75, False)
        acc_ID = index  #strains2get.loc[index]['NCBI_assembly_ID']
        info = "Genus: " + strains2get.loc[index]['genus'] + '\n' + \
               "Species: " + strains2get.loc[index]['species'] + '\n' + \
               "Strain: " + strains2get.loc[index]['name'] + '\n' + \
               "ID accession: " + acc_ID + '\n'

        ## module ngd requires to download data in the bacteria subfolder under the genbank folder
        dir_path = data_folder + '/genbank/bacteria/' + acc_ID

        ## check if already exists
        if acc_ID in database_df.index:
            print("\n+ Data is already available in database for: ")
            print(colored(info, 'green'))
        else:
            ## download
            print("\n+ Downloading data for:")
            print(colored(info, 'green'))
            data_accID = NCBIdownload(acc_ID, strains2get, data_folder)
            this_db = HCGB_main.get_data(data_accID, ',', 'index_col=0')
            this_db = this_db.set_index('ID')
            database_df = database_df.append(this_db)

    ## Generate/Update database
    database_csv = data_folder + '/genbank_database.csv'
    db_updated = update_db_data_file(database_df, database_csv)
    print("+ Database has been generated in file: ", database_csv)
    return (db_updated)
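## Illustrative sketch of the dataframe NCBI_DB() expects; the accession, strain
## and folder below are hypothetical, and additional columns are allowed.
def _example_NCBI_DB():
    import pandas as pd
    strains2get = pd.DataFrame({'genus': ['Staphylococcus'],
                                'species': ['aureus'],
                                'name': ['NCTC8325'],
                                'NCBI_assembly_ID': ['GCF_000013425.1']})
    return NCBI_DB(strains2get, '/data/NCBI', Debug=False)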
def run_search(arg_dict):
    """Main function of the search module in the BacDup package.

    This module searches for and creates the gene duplication analysis. The user
    can provide either a previously parsed data project (NCBI GenBank IDs,
    taxonomy or user annotation data) or single/multiple samples.
    """

    ## help messages
    if (arg_dict.input_help):
        help_input()
        exit()
    if (arg_dict.blast_help):
        info.blast_help()
        exit()
    if (arg_dict.project_help):
        info.project_help()
        exit()
    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t |-- GenBank format files')
    print('\t |-- GFF files + Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: ****
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)
    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)
    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)
    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                ## save results as a .csv file
                sort_csv = os.path.abspath(os.path.join(dict_dup_folders[sample], 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored("\t+ Filter results already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)
        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored("\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data
        #data2add_entry = data2add_entry.reset_index()
        data2add = data2add.append(data2add_entry, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
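## Illustrative sketch (assumption-heavy): run_search() consumes an argparse-style
## namespace. Only the attributes referenced above are set here; a real run also
## needs whatever parse_search_options() expects, so treat this as a skeleton.
def _example_run_search():
    import argparse
    arg_dict = argparse.Namespace(
        input_help=False, blast_help=False, project_help=False,
        detached_mode_help=False, detached=False, project=True,
        input_folder='/analysis/BacDup_project', debug=False,
        pident=85, evalue=1e-05, percentage=80, bitscore=50)
    run_search(arg_dict)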
def Saureus_specific(samples_df, samples_info, options, folder, outdir_dict):
    """
    Retrieves S. aureus specific information.

    See additional information in :doc:`../../user_guide/report/Saureus/saureus_report`
    """

    ########################################
    ## get European Quality Control genes
    ########################################
    Staphylococcus_path = os.path.abspath(os.path.join(
        os.path.realpath(__file__), '..', '..', 'report', 'Staphylococcus'))
    EQC_genes = os.path.join(Staphylococcus_path, "EQC_genes.csv")
    arcA_gene = os.path.join(Staphylococcus_path, "arcA.fasta")

    ## EQC_genes.csv contains entries such as:
    ## Gene,ID,Source
    ## mecA,ARO:3000617,CARD
    ## mecC,ARO:3001209,CARD
    ## mupA,ARO:3000521,CARD
    EQC_genes_df = HCGB_main.get_data(EQC_genes, ',', '')

    ## debugging messages
    if options.debug:
        HCGB_aes.debug_message("Saureus_specific", 'yellow')
        print(Staphylococcus_path)
        print(arcA_gene)
        HCGB_aes.debug_message("EQC_genes", 'yellow')
        print(EQC_genes)
        print(EQC_genes_df)

    ####################
    ## get gene info by unique ID
    ####################
    ## get gene names
    gene_IDs = EQC_genes_df['ID'].to_list()

    results_Profiles_ids = retrieve_genes.get_genes_profile(samples_info, gene_IDs, options.debug, 'ID')
    if options.debug:
        HCGB_aes.debug_message("results_Profiles_ids", 'yellow')
        print(results_Profiles_ids)

    ########################################
    ## add additional genes if required
    ########################################
    if options.genes_ids_profile:
        in_file = os.path.abspath(options.genes_ids_profile)
        gene_names = [line.rstrip('\n') for line in open(in_file)]
        if options.debug:
            print("gene_names")
            print(gene_names)

        results_Profiles_names = retrieve_genes.get_genes_profile(samples_info, gene_names, options.debug, 'name')
        if options.debug:
            print("results_Profiles_names")
            print(results_Profiles_names)

    #################################
    ## get blast sequence         ##
    #################################
    # arcA_gene

    ####################
    ## get spatyping  ##
    ####################
    samples_df = samples_df.set_index('name')
    assembly_files = samples_df.loc[samples_df['tag'] == "assembly", "sample"]
    results_spaType = pd.DataFrame()
    #results_spaType = get_spa_typing.module_call(options.database, assembly_files.to_dict(), options.debug)

    ####################
    ## get agr typing
    ####################
    agr_results = agr_typing.agrvate_caller(assembly_files.to_dict(), outdir_dict, options.debug)
    ## copy excel file and operon into report folder
    ## remove from dataframe

    ####################
    ## get sccmec
    ####################
    ## todo

    ####################
    ## save results
    ####################
    ## open excel writer
    name_excel = folder + '/Saureus_report.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    ## results_Profiles ids
    results_Profiles_ids.to_excel(writer, sheet_name="gene_ids")

    ## results_Profiles names
    if options.genes_ids_profile:
        results_Profiles_names.to_excel(writer, sheet_name="gene_names")

    ## results_spaType
    results_spaType.to_excel(writer, sheet_name="spaTyper")

    ## agr_results
    agr_results.to_excel(writer, sheet_name="agr typing")

    ## close excel writer
    writer.save()
def parse_card(folder, sampleName, fileResults, fileFlags, summary, assembly_cutoff, card_trick_info):
    ##
    ## Parses results from the CARD database.
    ## Input is a folder for output results, the sample name, and:
    ## - fileResults: report.tsv generated by ariba
    ## - fileFlags: generated by results_parser calling ariba expandflag
    ## - summary: the summary file generated by ariba summary
    ##

    ## get data
    summary_data = pd.read_csv(summary, header=0, sep=',')        ## report_summary.csv :: parsed information from ARIBA
    fileFlags_data = pd.read_csv(fileFlags, header=0, sep='\t')   ## flags_explain.tsv :: ariba expandflag: explained flags
    original_data = pd.read_csv(fileResults, header=0, sep='\t')  ## report.tsv :: ariba report generated
    card_ontology = HCGB_main.get_data(card_trick_info + '/aro.obo.csv', ',', 'index_col=0')  ## read card_info generated for card_trick parse

    ## summary data
    summary_data = summary_data.set_index('name')
    list_found_genes = summary_data.columns
    cluster_len = len(list_found_genes)

    ## print info
    print("\tCheck CARD result: ", sampleName)
    print("\t%s genes putatively involved in resistance to some antibiotics..." % cluster_len)

    ## subset
    data = original_data.loc[original_data['cluster'].isin(summary_data.columns)]

    ############################################################################
    ## analyze each cluster conferring resistance
    ############################################################################
    colnames = ['Reference', 'ID', 'Protein-coding', 'Presence/Absence',
                'Variants', 'Description', 'Additional information']

    ## get results: conferring resistance
    df_results = found_results(colnames, data, list_found_genes, 'CARD')

    ## get results: found, identified, partial
    df_identified = identified_results(original_data, "CARD", list_found_genes, assembly_cutoff)

    ############################################################################
    ## use the card-trick python package to get the ontology for each term
    AROS_identified = list(df_identified['ID'])
    information_ontology = card_trick_caller.get_info_CARD(AROS_identified, 'ARO', card_ontology)

    ##########################
    ## generate excel sheet
    ##########################
    ## open excel handle
    name_excel = folder + '/' + sampleName + '_CARD_results.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    ## write excel tabs
    df_results.to_excel(writer, sheet_name='results')                  ## results conferring resistance
    df_identified.to_excel(writer, sheet_name='identified')            ## identified genes: ARIBA flags explained
    information_ontology.to_excel(writer, sheet_name='CARD_ontology')  ## CARD ontology
    original_data.to_excel(writer, sheet_name='ARIBA_report')          ## original data from ARIBA
    summary_data.to_excel(writer, sheet_name='ARIBA_summary')          ## ARIBA summary generated
    fileFlags_data.to_excel(writer, sheet_name='flags')                ## ARIBA flags explained

    name_csv = folder + '/' + sampleName + '_CARD_summary.csv'
    df_identified.to_csv(name_csv)

    ## close excel handle
    writer.save()

    return (name_excel, name_csv)
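## Illustrative usage sketch: the three input files are the standard ARIBA outputs
## for a CARD run; all paths are hypothetical, and card_trick_info points to the
## folder holding aro.obo.csv.
def _example_parse_card():
    return parse_card('/analysis/card/sample1', 'sample1',
                      '/analysis/card/sample1/report.tsv',
                      '/analysis/card/sample1/flags_explain.tsv',
                      '/analysis/card/sample1/report_summary.csv',
                      assembly_cutoff=0.9, card_trick_info='/data/card_trick')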
def update_database_user_data(database_folder, project_folder, Debug, options):
    """
    Updates the user_data folder within the database folder provided.

    It generates a single subfolder for each sample previously analyzed and
    stores the main information and result files for later interpretation,
    comparison and/or summarization with newly analyzed samples.

    :param database_folder: Absolute path to database folder.
    :param project_folder: Absolute path to project folder.
    :param Debug: True/False for debugging messages.
    :param options: Options passed from the command line.

    :type database_folder: string
    :type project_folder: string
    :type Debug: boolean
    :type options: argparse.Namespace

    :returns: Updated database result from :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
    :rtype: Dataframe

    :warnings: Returns **FAIL** if check process failed.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_subfolder`

        - :func:`HCGB.functions.main_functions.get_data`

        - :func:`HCGB.functions.main_functions.optimize_threads`

        - :func:`BacterialTyper.scripts.database_user.get_userData_files`

        - :func:`BacterialTyper.scripts.database_user.update_sample`

        - :func:`BacterialTyper.scripts.database_generator.getdbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
    """

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data
    project_all_data = pd.concat([project_data_df, project_info_df], join='outer', sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)
        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)
        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder, 'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' % (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## send a job for each sample
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data, user_data_db, Debug): name
            for name, cluster in sample_frame
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    for name, cluster in sample_frame:
        ###### dump to file
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            ## concatenating by outer we get all available entries
            user_data_db = pd.concat([user_data_db, dataGot], join='outer', sort=True).drop_duplicates()

            if (options.debug):
                print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
                print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'
    dataUpdated = database_generator.update_db_data_file(user_data_db, database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)
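## Illustrative usage sketch: 'options' must carry the attributes used above
## (threads, single_end, debug, ...); the database and project paths are
## hypothetical.
def _example_update_database_user_data(options):
    return update_database_user_data('/data/BacterialTyper_db',
                                     '/analysis/my_project', False, options)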
def get_R_packages():
    dep_file = os.path.abspath(os.path.join(os.path.dirname(__file__), 'R', 'R_dependencies.csv'))
    dep_file_data = HCGB_main.get_data(dep_file, ',', 'index_col=0')
    return (dep_file_data)
def check_perl_packages(file_name, Debug, option_install, install_path):
    """
    Check the perl packages required.

    This function checks whether the packages installed in the system fulfil the
    minimum version specified in the configuration file. Details of the perl
    packages required are available in :file:`BacterialTyper/config/perl/`.

    It uses function :func:`BacterialTyper.config.set_config.get_perl_packages` to
    retrieve the version of the perl packages installed in the system. Then it uses
    :func:`BacterialTyper.config.extern_progs.min_perl_package_version` to retrieve
    the minimum version specified. It compares them using function
    :func:`BacterialTyper.config.set_config.check_install_module`.

    :param file_name: Name of the file to search within :file:`BacterialTyper/config/perl/`.
    :param Debug: True/False for debugging messages.
    :param option_install: True/False for installing missing dependencies.
    :param install_path: Install path for installing modules.

    :type file_name: string
    :type Debug: boolean
    :type option_install: boolean
    :type install_path: string

    :returns: Print messages if packages are installed.

    .. seealso:: This function relies on other ``BacterialTyper`` functions:

        - :func:`BacterialTyper.config.set_config.get_perl_packages`

        - :func:`BacterialTyper.config.set_config.check_install_module`

        - :func:`BacterialTyper.config.extern_progs.min_perl_package_version`

        - :func:`BacterialTyper.config.install_dependencies.perl_package_install`
    """

    ## get perl packages installed
    my_packages_installed = get_perl_packages(Debug, file_name)

    ## debug messages
    if (Debug):
        print("my_packages_installed :: ")
        print(my_packages_installed)

    ## min versions for packages
    my_packages_requirements = extern_progs.min_perl_package_version(file_name)

    ## debug messages
    if (Debug):
        print("my_packages_requirements")
        print(my_packages_requirements)

    ## get info for perl modules
    perl_lib_dependecies_file = extern_progs.file_list(file_name)
    perl_lib_dependecies = HCGB_main.get_data(perl_lib_dependecies_file, ',', 'index_col=0')

    ## check each package
    for each in my_packages_requirements:
        ## get min version
        min_version = my_packages_requirements[each]

        ## get version installed in system
        installed = my_packages_installed[each]

        ## module name conversion
        module_name = perl_lib_dependecies.loc[each, 'module']

        ## debug messages
        if (Debug):
            print("Module:", each)
            print("Module name:", module_name)
            print("Min_Version:", min_version)
            print("Version installed:", installed)

        ## check if installed
        message = check_install_module(installed, module_name, min_version, 'Package')
        if (message == 'OK'):
            continue
        else:
            print(colored("** ATTENTION: Installation of perl modules is not supported", 'red'))
            print("+ Please install manually package: ", module_name, " to continue with BacterialTyper\n\n")
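## Illustrative usage sketch (assumption: 'perl_dependencies' is the configuration
## file key; installation of missing modules is not supported, so option_install
## and install_path are effectively informational here).
def _example_check_perl_packages():
    check_perl_packages('perl_dependencies', Debug=False,
                        option_install=False, install_path=None)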
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()
    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()

    ######################################################
    ## create folder
    ## absolute path
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path, 'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is a file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)

                ## get file information
                print("\t+ Obtaining information from file: %s" % abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included in a kma index

        ## Get all entries belonging to the taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check descendant NCBI taxonomy ids provided ---------\n")
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA', genbank_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:
        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(colored("DEBUG: User provides folder containing project", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(options.path, abs_project_folder, Debug, options)
        else:
            print(colored("ERROR: Folder provided does not exist: %s" % options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA', user_kma_db,
                                               'new', 'batch', Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))
    else:
        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path, Debug, options.threads)

        ### ariba user databases
        if (options.ariba_users_fasta):
            print("+ Generate ARIBA database for databases provided: prepare fasta and metadata information")

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from the KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/
    print("+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website")

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()
    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()