def BUSCO_check_dataset(folder, name):
    config_file = folder + '/dataset.cfg'

    ## Example dataset.cfg content:
    ## name=bacteria_odb9
    ## species=E_coli_K12
    ## domain=prokaryota
    ## creation_date=2016-11-01
    ## number_of_BUSCOs=148
    ## number_of_species=3663

    if os.path.isdir(folder):
        #print ("+ Checking the integrity of BUSCO dataset in folder: ", folder)
        HCGB_aes.print_sepLine("+", 10, False)
        print ("Statistics for dataset: ")
        HCGB_aes.print_sepLine("+", 10, False)

        if os.path.isfile(config_file):
            list_config = HCGB_main.readList_fromFile(config_file)
            for elem in list_config:
                line = elem.split("=")
                line[0] = line[0].replace("_", " ")
                print (" " + "\t".join(line))

            print()
            print ("Available in folder: ", folder)
            print (colored("Dataset....[ OK ]\n", 'green'))
        else:
            print (colored("Dataset....[ FAIL ]\n", 'red'))
            print ("+ Removing dataset to avoid further errors:")
            ## the dataset folder may contain files: remove the whole tree
            import shutil
            shutil.rmtree(folder)
            return ('FAIL')
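## Minimal usage sketch (hypothetical dataset folder; the function prints the
## dataset statistics and returns 'FAIL' when dataset.cfg is missing):
# status = BUSCO_check_dataset('/path/to/BUSCO_datasets/bacteria_odb9', 'bacteria_odb9')
# if status == 'FAIL':
#     print('+ Dataset needs to be downloaded again')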
def R_package_path_installed():
    """Provides the absolute path stored in ``R_package.info.txt``, i.e. the folder where missing R packages were previously installed."""

    ## check if it exists or prepare the installation folder
    RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'R', 'R_package.info.txt')

    if HCGB_files.is_non_zero_file(RDir_package):
        list_paths = HCGB_main.readList_fromFile(RDir_package)
        return (list_paths[0])
    else:
        path2install = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                    'R', 'install_packages')
        HCGB_files.create_folder(path2install)
        return (path2install)
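## Minimal usage sketch (assumes an 'R' subfolder next to this module, as used above):
# r_lib_path = R_package_path_installed()
# print('Missing R packages are installed under:', r_lib_path)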
def dead_code(card_prepareref):
    ## card_prepareref: path prefix of the ARIBA CARD prepareref output (taken as a parameter)
    rename_info = card_prepareref + '00.rename_info'

    ## outfile
    outfile = card_prepareref + '00.info_dictionary'
    out_file_handle = open(outfile, 'w')

    ## get info
    lines = HCGB_main.readList_fromFile(rename_info)
    for l in lines:
        names = l.split('\t')  ## original name \t ariba_name
        out_file_handle.write(names[1].split('.')[0] + '\t' + names[0].split('.')[0] + '\n')

    out_file_handle.close()
    return (outfile)
def run_cluster(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        else:
            ## trimmed reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "trim", ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]

    else:
        ## default: use assemblies
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## exit if empty
    if pd_samples_retrieved.empty:
        print("No data has been retrieved from the project folder provided. Exiting now...")
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "mash", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[~retrieve_databases.index.isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_databases **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            if all([int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])]):
                siglist_all.append(min_hash_caller.read_signature(row['path'], options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'], index,
                                             options.kmer_size, options.n_sketch, Debug)
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)

    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(colored('\t+ Sketched signature (%s) available within user data...' % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## File signature might exist

            ## read original
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)
                if all([int(options.kmer_size) == int(original[1]),
                        int(options.n_sketch) == int(original[2])]):
                    siglist_all.append(min_hash_caller.read_signature(this_sig, options.kmer_size))
                    pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                        'project_data', index, this_sig, row['sample'],
                        options.kmer_size, options.n_sketch)
                    print(colored('\t+ Sketched signature available (%s) in project folder...' % index, 'green'))
                    continue

        print(colored('\t+ Sketched signature to be generated: (%s)...' % index, 'yellow'))

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'], index,
                                             options.kmer_size, options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = (
            'project_data', index, sigfile, row['sample'],
            options.kmer_size, options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[['source', 'db', 'path', 'original', 'ksize', 'num_sketch']]
        tmp = tmp.rename(columns={'db': 'name'})
        tmp = tmp.set_index('name')

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp], join='inner', sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)

        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)

        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)

        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign colors: colorLabels
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data), "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data), "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        #final_dir = outdir + '/report/cluster'
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all, tag_cluster_info, Debug)

    ## get colorLabels

    ## plot images
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext, tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext, tag_cluster_info)

    return ()
def get_options_db(options):
    ##
    ## Among all databases available and according to the input options,
    ## select the databases to use and set a dataframe with this information
    ##
    print("\n\n+ Select databases to use for identification:")

    ### database folder to use
    database2use = os.path.abspath(options.database)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: Database to use: " + database2use + " **", 'yellow'))

    ## external file provided: single or batch
    if (options.external_file):
        abs_path_ext_file = os.path.abspath(options.external_file)
        if options.batch_external:
            myList = HCGB_main.readList_fromFile(abs_path_ext_file)
            join_str = ','.join(myList)
        else:
            join_str = abs_path_ext_file

    ############################################################
    ### Options: according to user input: select databases to use
    option_db = ""

    ############
    ## 1) only user data: previously identified and added
    ############
    if (options.user_data):
        option_db = "Mash:user_data"

    ############
    ## 2) only genbank data: previously downloaded from NCBI reference genomes
    ############
    elif (options.genbank_data):
        option_db = "Mash:genbank"

    ############
    ## 3) only project_data
    ############
    elif (options.only_project_data):
        option_db = "Mash:project_data"
        pd_MASH = pd.DataFrame()
        return (pd_MASH)

    ############
    ## 4) only external_data
    ############
    elif (options.only_external_data):
        option_db = "Mash_external_data:" + join_str

    #################
    ## all databases
    #################
    else:
        #############################
        option_db = 'Mash:user_data#Mash:genbank'
        if (options.external_file):
            option_db = option_db + '#Mash_external_data:' + join_str

    ###############
    ### get dbs
    ###############
    print("\n+ Parsing information to retrieve databases")
    print("+ Reading from database: " + database2use)
    HCGB_aes.print_sepLine("-", 50, False)

    ###############
    ## debug message
    if (Debug):
        print(colored("**DEBUG: option_db: " + option_db + " **", 'yellow'))

    pd_MASH = database_generator.getdbs("MASH", database2use, option_db, Debug)
    HCGB_aes.print_sepLine("-", 50, False)

    ## return dataframe with Mash database information
    return (pd_MASH)
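## Minimal usage sketch (hypothetical values; 'opts' stands in for the argparse
## namespace built by this module's command-line parser, and the database path
## is illustrative):
# from argparse import Namespace
# opts = Namespace(database='/path/to/BacterialTyper_database',
#                  external_file=None, batch_external=False,
#                  user_data=True, genbank_data=False,
#                  only_project_data=False, only_external_data=False)
# pd_MASH = get_options_db(opts)   ## dataframe with source, db, path, ... columns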
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.

    :param source: Type of database to search: ARIBA, KMA, NCBI, MLST, MASH, user_data.
    :param database_folder: Absolute path to database folder.
    :param option: String containing multiple entries separated by '#' that indicate the type of database entries to search within each source type.
    :param debug: True/False for debugging messages.

    :type source: string
    :type database_folder: string
    :type option: string
    :type debug: bool

    :returns: Dataframe containing absolute paths to the available databases for each type requested. It contains columns for: "source", "db", "path".

    e.g.: source = KMA
    option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank

    e.g.: source = NCBI
    option = genbank
    """

    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## user input
    dbs2use = []
    option_list = option.split("#")

    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        ###
        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_External', name_ext, ext]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        elif (option_item.startswith('ARIBA:')):
            dbs2use = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = ['Mash_external', name_ext_, ext]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(option_item)  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ## init dataframe
    #colname = ["source", "db", "path"]
    #db_Dataframe = pd.DataFrame(columns = colname)

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names

        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = ['ARIBA', ariba_db, this_db]
                    print(colored("\t- ARIBA: including information from database: " + ariba_db, 'green'))
            else:
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug, 2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = ['ARIBA', ariba_db, this_db]
                    print(colored("\t- ARIBA: including information from database: " + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA database exists
                this_db_file = this_db + '/genbank_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(colored("\t- genbank: including information from different reference strains available.", 'green'))
                    ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_genbank', 'genbank', this_db_file]

            #### user_data
            elif (db == "user_data"):
                ## KMA database exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(colored("\t- user_data: including information from user previously generated results", 'green'))
                    ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_user_data', 'user_data', this_db_file]

            ## default KMA databases: bacteria & plasmids
            else:
                ##
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)

                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = ['KMA_db', db, this_db_file]
                    print(colored("\t- KMA: including information from database " + db, 'green'))
                else:
                    print(colored("\t**KMA: Database %s was not available." % db, 'red'))

                    ## if missing: call download module
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = ['KMA_db', db, this_db_file]
                        print(colored("\t- KMA: including information from database " + db, 'green'))
                    else:
                        print(colored("\t**KMA: Database %s was not available." % db, 'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):

        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            ##
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = ['NCBI:genbank', entry, this_db]

        elif dbs2use[0] == 'tax_id':
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder('PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)

                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = ['MLST', 'PubMLST', entry + ',' + this_db]
                    print(colored("\t- MLST: including information from profile: " + entry, 'green'))

            else:
                db_Dataframe.loc[len(db_Dataframe)] = ['MLST', 'user_profile', db]
                print(colored("\t- MLST: including information from profile provided by user: " + db, 'green'))

    ##############
    #### MASH ####
    ##############
    elif (source == 'MASH'):
        ## Mash entries track additional sketch information: add the extra columns,
        ## keeping any external entries already registered above
        db_Dataframe = db_Dataframe.reindex(
            columns=["source", "db", "path", "original", "ksize", "num_sketch", "folder"])

        ### get information
        for db in dbs2use:

            #### genbank
            if (db == "genbank"):
                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(colored("\n\t- genbank: including information from different reference strains available.", 'green'))

                    ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')
                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])

                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN']
                            else:
                                original = HCGB_main.readList_fromFile(file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0],
                                original[1], original[2], this_db]
                        else:
                            ## index assembly or reads...
                            list_fna = HCGB_main.retrieve_matching_files(this_db, 'genomic.fna', debug)

                            ## not available
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db]

            #### user_data
            elif (db == "user_data"):
                print(colored("\n\t- user_data: including information from user previously generated results", 'green'))

                ## include user data
                db2use_abs = HCGB_files.create_subfolder('user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    if entry == 'user_database.csv':
                        continue

                    print('\t+ Reading information from sample: ', entry)

                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        ##
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0],
                            original[1], original[2], this_db + '/mash']
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0],
                            'NaN', 'NaN', this_db + '/mash']

        #### external_data
        ### TODO: Fix this
        mash_bin = ""  # set_config.get_exe('mash')
        if 'Mash_external' in db_Dataframe['source'].to_list():
            print(colored("\t- external_data: including information from external data provided by user", 'green'))

            ## include user data
            db_Dataframe = db_Dataframe.set_index("db", drop=False)
            frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
            for index, row in frame.iterrows():
                print('\t+ Reading information for file: ', row['db'])
                outfile = row['path'] + '.msh'
                if not os.path.exists(outfile):
                    path_file = os.path.dirname(row['path'])
                    this_db_file = min_hash_caller.sketch_database(
                        [row['path']], mash_bin, row['path'], row['db'], path_file)
                    HCGB_aes.print_sepLine("*", 50, False)

                ## sketch details are not tracked for external entries: fill with 'NaN' placeholders
                db_Dataframe.loc[row['db']] = ['Mash_external', row['db'], outfile,
                                               row['path'], 'NaN', 'NaN',
                                               os.path.dirname(row['path'])]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
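## Minimal usage sketch (illustrative paths; the option string follows the
## '#'-separated format documented in the docstring above):
# kma_dbs = getdbs("KMA", "/path/to/database",
#                  "kma:bacteria,plasmids#kma_external:/path/to/file1.fasta", debug=False)
# mash_dbs = getdbs("MASH", "/path/to/database", "Mash:user_data#Mash:genbank", debug=False)
# print(kma_dbs[["source", "db", "path"]])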
def parse_options(arg_dict):

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:', 'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file, 'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)
            print(colored('\t* Multiple annotation files provided .......[OK]', 'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(columns=(BacDup_functions.columns_accID_table()))
        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('dict_entries check annotation files provided option:', 'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            format = format_checker.is_format(file_annot, arg_dict.debug)
            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy, organism) = BacDup.scripts.functions.get_gbk_information(file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot, format,
                                           prot, plasmid_count, ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)
            print(colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]', 'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank ID provided option:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):

        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]', 'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:', 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel', arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(elem) for elem in string_info_total]

        ## assume all belong to the same superkingdom if children of the same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message('arg_dict.assembly_level: ' + arg_dict.assembly_level, "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get, allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
            db_folder, string_info_total, int(arg_dict.k_random), arg_dict.debug,
            assembly_level_given=arg_dict.assembly_level,
            group_given=group_obtained, section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print("ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print("\n\nIf random numbers were selected, take into account that re-running this process might produce different results.\n")
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(colored('\t* A previous BacDup analysis project folder:.......[OK]', 'green'))

        ## create df_accID to store data
        ## TODO

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)
def generate_db(file_abs_paths, name, fold_name, option, type_option, Debug, kma_bin):
    """Generate a call to create or update an indexed KMA database for later k-mer identification.

    :param file_abs_paths: List of absolute paths to fasta genome files to include in the database.
    :param name: Database name.
    :param fold_name: Directory path to store the database generated.
    :param option: Generate a new database (option='new') or add to a pre-existing database (option='add'). If the database already exists, entries are automatically added.
    :param type_option: Index genome fasta files one by one (type_option='single') or using a batch file containing multiple entries (type_option='batch').
    :param Debug: True/False for debugging messages.
    :param kma_bin: Binary executable for the KMA software.

    :type file_abs_paths: list
    :type name: string
    :type fold_name: string
    :type option: string
    :type type_option: string
    :type Debug: bool
    :type kma_bin: string

    :returns: Absolute path to the database generated.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.readList_fromFile`

        - :func:`BacterialTyper.scripts.functions.printList2file`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.index_database`
    """
    print ('+ Updating the KMA database: ', name)

    ## check
    if len(file_abs_paths) > 1:
        ## read db in fold_name and get index files
        info = fold_name + '/' + name + '.db'

        ##
        lineList = []
        toIndexList = []
        indexedList = []

        ###
        if os.path.exists(info):
            lineList = HCGB_main.readList_fromFile(info)
            option = 'add'

        for f in file_abs_paths:
            baseName = os.path.basename(f)

            ## check if already indexed
            if baseName in lineList:
                print (colored('\t+ File %s is already available in database %s' % (baseName, name), 'green'))
                indexedList.append(f)
            else:
                toIndexList.append(f)

        if toIndexList:
            ## generate batch file and call the indexer
            info2 = fold_name + '/.batch_entries.txt'
            HCGB_main.printList2file(info2, toIndexList)
            status = index_database(info2, kma_bin, name, option, fold_name, type_option)
            final_list = set(lineList + toIndexList + indexedList)
            final_list_name = [os.path.basename(f) for f in final_list]
            HCGB_main.printList2file(info, final_list_name)
            count_files = len(toIndexList)
            print ('+ %s samples have been added to the database' % count_files)
        else:
            print ('\n+ No new sequences were added to the database.')

        return (fold_name + '/' + name)

    else:
        file_name = file_abs_paths[0]

        ## check if previously indexed
        status = check_db_indexed(file_name, fold_name)
        if (status):  # True
            ## debug message
            if (Debug):
                print (colored("**DEBUG: Database (%s) is indexed" % file_name + " **", 'yellow'))
            return (file_name)
        else:  # False
            ## debug message
            if (Debug):
                print (colored("**DEBUG: Database (%s) is not indexed" % file_name + " **", 'yellow'))
            status = index_database(file_name, kma_bin, file_name, option, fold_name, type_option)

            ## return
            if (status):  # True
                return (file_name)
            else:
                return False
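## Minimal usage sketch (hypothetical paths and binary location; indexing is
## delegated to index_database() internally):
# kma_bin = '/path/to/kma'
# fasta_files = ['/path/to/genome1.fna', '/path/to/genome2.fna']
# db_path = generate_db(fasta_files, 'userData_KMA', '/path/to/db/user_data',
#                       'new', 'batch', False, kma_bin)
# print('KMA database stored at:', db_path)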
def check_db_indexed(index_name, folder):
    """Check the status of a KMA database.

    :param index_name: Index name for the database.
    :param folder: Absolute path of the folder containing the database.

    :type index_name: string
    :type folder: string

    :returns: True/False for the index status.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.readList_fromFile`

        - :func:`BacterialTyper.scripts.functions.get_number_lines`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`
    """
    # Each indexed db consists of files with the following extensions:
    # comp.b, length.b, name, seq.b and, optionally, index.b
    my_index_list = [".comp.b", ".index.b", ".length.b", ".name", ".seq.b"]

    print ("\t+ Checking if database has been previously indexed...")

    for sufix in my_index_list:
        ##print (sufix)
        my_file = index_name + sufix
        if os.path.isfile(my_file):
            print ("\t" + my_file + ' exists...')
        else:
            if (sufix == '.index.b'):
                continue
            else:
                return (False)

    ## check if previously assembled and succeeded
    filename_stamp = folder + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tDatabase was generated on: %s" % stamp, 'yellow'))

        ## Check if necessary to download again after several months/days
        days_passed = HCGB_time.get_diff_time(filename_stamp)
        print ("\t\t** %s days ago" % days_passed)

        ## download again
        if (days_passed > 60):
            print ("\t\t** Downloading information again just to be sure...")
            return (False)

    ## dump in screen
    names = index_name + '.name'
    count = HCGB_main.get_number_lines(names)
    print ("\n\t+ Database seems OK and contains several entries (%s):\n" % count)
    if (count > 50):
        print ("\tToo many entries in the database.\n\tCheck file %s for further details." % names)
    else:
        entries = HCGB_main.readList_fromFile(names)
        print (*entries, sep='\n')

    return (True)
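## Minimal usage sketch (hypothetical database location; returns True only when the
## .comp.b/.length.b/.name/.seq.b files are present and the time stamp is recent enough):
# is_ready = check_db_indexed('/path/to/db/KMA_db/bacteria/bacteria.ATG',
#                             '/path/to/db/KMA_db/bacteria')
# if not is_ready:
#     print('Database needs (re-)indexing or downloading')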