def ariba_run_caller(db2use, db_name, list_files, folder_out, threads, cutoff):
    """Run ARIBA for one sample against one database unless already done.

    A stamp file ``.success_<db_name>`` inside ``folder_out`` marks a finished
    run. If it exists, the stored time stamp is reported and nothing is
    executed. Otherwise any leftover output folder is deleted and
    ``ariba_caller.ariba_run`` is called; the stamp is written only when that
    call does not report ``'FAIL'``.

    :param db2use: Database reference forwarded to ``ariba_caller.ariba_run``.
    :param db_name: Database name, used as suffix of the stamp file name.
    :param list_files: Paths of the input files for the sample.
    :param folder_out: Output folder for the run.
    :param threads: Number of CPUs to use.
    :param cutoff: Cutoff value forwarded to ``ariba_caller.ariba_run``.
    """
    ## stamp file that marks a finished run for this database
    filename_stamp = os.path.join(folder_out, '.success_' + db_name)

    ## already done: report and leave
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        files_names = [os.path.basename(s) for s in list_files]
        print(colored(
            "\tA previous command generated results on: %s [Files: %s]" % (stamp, files_names),
            'yellow'))
        return

    ## delete folder if it exists but the previous attempt failed
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)

    ## call
    code = ariba_caller.ariba_run(db2use, list_files, folder_out, threads, cutoff)
    if code == 'FAIL':
        ## do not stamp a failed run: it has to be retried next time
        print("*** ERROR: System call failed for ", folder_out)
        return

    ## print success timestamp
    HCGB_time.print_time_stamp(filename_stamp)
def init_db_object(debug):
    """Instantiate the ete NCBI taxonomy object and refresh it when stale.

    A time stamp file ``timestamp_db.txt`` is kept next to the taxonomy
    database file. When it is missing it is created; when it is older than
    30 days the database is updated via :func:`update_db`.

    Created by Joe R. J. Healey; Nick Youngblut
    Original code.

    :param debug: True/False to print developer messages.
    :return: The ``NCBITaxa`` object.
    """
    print("+ ------------------------------------- +")
    print("+ Looking for NCBI taxonomy database:")
    ncbi = NCBITaxa()

    if debug:
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message(
            'NCBI Taxonomy database is stored at {}\n'.format(ncbi.dbfile),
            "yellow")

    ## the database is downloaded here: ~/.etetoolkit/taxa.sqlite
    db_folder = os.path.dirname(format(ncbi.dbfile))
    filename_stamp_parse = db_folder + '/timestamp_db.txt'

    ## first time: just create the time stamp
    if not os.path.isfile(filename_stamp_parse):
        time_functions.print_time_stamp(filename_stamp_parse)
        return ncbi

    stamp = time_functions.read_time_stamp(filename_stamp_parse)
    days_passed = time_functions.get_diff_time(filename_stamp_parse)

    if debug:
        debug_message('Database previously initiated', "yellow")
        debug_message('on date: {}'.format(stamp), "yellow")
        debug_message('Days passed: {}'.format(days_passed), "yellow")

    ## stale database: update it
    if days_passed > 30:
        update_db(ncbi, db_folder, debug)
        return ncbi

    ## recent enough: only report
    if debug:
        debug_message('No need to update db', "yellow")
        debug_message(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    print(colored(
        "\tA previous command generated results on: %s [%s]" % (stamp, 'init database'),
        'yellow'))
    return ncbi
def create_blast_results(sample, fasta_file, outdir, debug):
    '''Creates BLAST results for each fasta vs. itself.

    Two stamp files inside ``outdir`` control what is (re)computed:
    ``.db_success`` for the BLAST database and ``.blast_success`` for the
    blastp search.

    :param sample: Sample name, used as prefix of the BLAST database name.
    :param fasta_file: Protein fasta file used both as database and as query.
    :param outdir: Folder for the database, the raw results and the stamps.
    :param debug: True/False to print developer messages.
    :return: Absolute path of ``BLAST_raw_results.tsv``.
    '''
    ## phr is the header file, pin is the index file, psq is the sequence file

    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')

    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')

    ## search already done: report and leave
    if HCGB.functions.files_functions.is_non_zero_file(search_timestamp):
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print (colored("\t+ Duplicate search already available for sample %s [%s]" %(sample, read_time), 'green'))
        return (raw_blast)

    ## get binaries
    (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
    ## NOTE(review): the configured binaries are overridden by these fixed
    ## paths, which makes the call above ineffective. Kept as found; confirm
    ## whether the override is still wanted.
    makeblastdb_exe = "/usr/bin/makeblastdb"
    blastp_exe = "/usr/bin/blastp"

    ## check if db is indexed already
    db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
    if (not HCGB.functions.files_functions.is_non_zero_file(db_timestamp)):
        ## generate blastdb for genome
        HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot') # HCGB function

        ## print time stamp
        HCGB_time.print_time_stamp(db_timestamp)
    else:
        ## read the stamp before reporting it (it used to be referenced unset)
        read_time = HCGB_time.read_time_stamp(db_timestamp)
        print (colored("\t+ BLAST database already available for sample %s [%s]" %(sample, read_time), 'green'))

    ## create blastp outfile
    HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1) # HCGB function

    ## print time stamp
    HCGB_time.print_time_stamp(search_timestamp)

    return (raw_blast)
def run_module_SPADES_old(name, folder, file1, file2, threads):
    """Assemble a sample with SPAdes (chromosome and plasmids) and stamp it.

    Runs the main and the plasmid assembly, discards plasmid sequences from
    the main assembly, renames the resulting fasta entries, prints contig
    statistics and finally writes ``folder/.success``.

    :param name: Sample name.
    :param folder: Output folder; created if necessary.
    :param file1: Fastq reads (R1).
    :param file2: Fastq reads (R2).
    :param threads: Number of CPUs to use.
    """
    print ("+ Calling spades assembly for sample...", name)

    ## output folder and binary
    HCGB_files.create_folder(folder)
    spades_exe = set_config.get_exe('spades')

    ## main assembly, then plasmid assembly
    chromosome_path = run_SPADES_assembly(folder, file1, file2, name, spades_exe, threads)
    plasmid_path = run_SPADES_plasmid_assembly(folder, file1, file2, name, spades_exe, threads)

    ## discard plasmids from main
    (contigs_tmp, plasmids_tmp) = discardPlasmids(chromosome_path, plasmid_path, folder, name)

    ## rename fasta sequences: <prefix>.fna.tmp -> <prefix>.fna
    contigs_final = contigs_tmp.split(".fna.tmp")[0] + '.fna'
    rename_contigs(contigs_tmp, "scaffolds_chr", contigs_final)

    plasmids_final = ""
    if os.path.isfile(plasmids_tmp):
        plasmids_final = plasmids_tmp.split(".fna.tmp")[0] + '.fna'
        rename_contigs(plasmids_tmp, "scaffolds_plasmids", plasmids_final)

    ## contig stats
    stats(contigs_final, plasmids_final)

    ## success stamp
    HCGB_time.print_time_stamp(folder + '/.success')
def download_VFDB_files(folder):
    """Download the VFDB files into ``folder`` unless a recent copy exists.

    Given a folder, check if it contains VFDB information or download it
    from the website: http://www.mgc.ac.cn

    A file ``download_timestamp.txt`` records the last download. Data
    younger than or equal to 30 days is kept; older data is downloaded again.

    :param folder: Folder to store the VFDB files; created if missing.
    :return: Empty tuple.
    """
    links = (
        "http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz",
        "http://www.mgc.ac.cn/VFs/Down/Comparative_tables_from_VFDB.tar.gz")

    ## time stamp
    filename_stamp = folder + '/download_timestamp.txt'

    if not os.path.exists(folder):
        HCGB_files.create_folder(folder)
    elif os.path.isfile(filename_stamp):
        ## how old is the data? >30 days is long enough to update it again
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print("+ A previous download generated results on: ", stamp)
        days_passed = HCGB_time.get_diff_time(filename_stamp)
        print("\t\t** %s days ago" % days_passed)
        if not (days_passed > 30):
            print("\t\t** No need to download data again.")
            return ()
        print("\t\t** Downloading information again just to be sure...")

    ## download every link that is not commented out
    print('+ Downloading files:\n')
    for url in links:
        if url.startswith('#'):
            continue
        HCGB_sys.wget_download(url, folder)

    ## decompress files
    print('+ Decompressing gzip files\n')
    for entry in os.listdir(folder):
        if entry.endswith('.gz'):
            HCGB_files.extract(folder + '/' + entry, folder)

    ## make stamp time
    HCGB_time.print_time_stamp(filename_stamp)
    return ()
def module_call(sequence_fasta, kingdom, genus, path, name, threads):
    """
    Function that checks and generates annotation.

    - It uses Prokka_ via :func:`BacterialTyper.scripts.annotation.prokka_call`.

    - It checks if previously generated (file ``.success`` within ``path``).

    - Once finished, it prints timestamp

    :param sequence_fasta: Assembled sequences in fasta file format.
    :param kingdom: Available kingdoms mode for Prokka software: Archaea|Bacteria|Mitochondria|Viruses
    :param genus: Available genus options for Prokka software.
    :param path: Absolute path to the output folder to include results.
    :param name: Sample name and tag to include in the annotation report and files.
    :param threads: Number of CPUs to use.

    :type sequence_fasta: string
    :type kingdom: string
    :type genus: string
    :type path: string
    :type name: string
    :type threads: integer

    :return: Empty tuple if a previous annotation is found, otherwise the
        value returned by ``prokka_call``.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`BacterialTyper.scripts.set_config.get_exe`

        - :func:`HCGB.functions.time_functions.read_time_stamp`

        - :func:`HCGB.functions.time_functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.annotation.prokka_call`

    .. include:: ../../links.inc
    """
    success_file = path + '/.success'

    ## previously annotated and succeeded: report and leave
    if os.path.isdir(path) and os.path.isfile(success_file):
        previous = HCGB_time.read_time_stamp(success_file)
        message = "\tA previous command generated results on: %s [%s]" % (previous, name)
        print(colored(message, 'yellow'))
        return ()

    ## call prokka
    prokka_exe = set_config.get_exe('prokka')
    annotation_dir = prokka_call(prokka_exe, sequence_fasta, kingdom, genus,
                                 path, name, threads)

    ## success stamp
    HCGB_time.print_time_stamp(success_file)
    return (annotation_dir)
def SPADES_systemCall(sample_folder, file1, file2, name, SPADES_bin, options, threads, debug=False):
    """Generate SPADES system call.

    It calls system for SPADES and generates time stamp file in the folder
    provided (sample_folder + '/.success_assembly') for later analysis.

    Steps:

    - It generates system call for SPADES assembly.

    - It generates timestamp file.

    :param sample_folder: Absolute path to store results. It must exists.
    :param file1: Absolute path to fastq reads (R1).
    :param file2: Absolute path to fastq reads (R2).
    :param name: Sample name or tag to identify sample.
    :param SPADES_bin: Binary executable for SPADES assembly software.
    :param options: Plasmid assembly is possible if specified via options
        (--plasmid). The text is placed directly in front of ``-t``, so a
        non-empty value has to end with a blank.
    :param threads: Number of CPUs to use.
    :param debug: Not used by this function.

    :type name: string
    :type sample_folder: string
    :type file1: string
    :type file2: string
    :type SPADES_bin: string
    :type options: string
    :type threads: integer

    :return: Returns **OK** if assembly process succeeded and fasta file is generated.
    :rtype: string.
    :warnings: Returns **FAIL** if assembly process stopped.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.main_functions.system_call`

        - :func:`HCGB.functions.time_functions.print_time_stamp`
    """
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_assembly'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        return('OK')

    ## call system for SPADES sample given
    logFile = sample_folder + '/' + name + '.log'

    ## command: stderr is sent to the same descriptor as stdout (2>&1).
    ## Redirecting both streams to the file name separately opens it twice
    ## and the two streams overwrite each other.
    cmd_SPADES = '%s %s-t %s -o %s -1 %s -2 %s > %s 2>&1' %(
        SPADES_bin, options, threads, sample_folder, file1, file2, logFile)
    code = HCGB_sys.system_call(cmd_SPADES)

    if (code == 'OK'):
        ## success stamps
        HCGB_time.print_time_stamp(filename_stamp)
        return('OK')

    return "FAIL"
def update_db(ncbi_db, db_folder, debug):
    """Update the NCBI taxonomy database and refresh its time stamp.

    Created by Joe R. J. Healey; Nick Youngblut
    Original code.

    :param ncbi_db: Taxonomy object providing ``dbfile`` and
        ``update_taxonomy_database()``.
    :param db_folder: Folder where ``timestamp_db.txt`` is written.
    :param debug: True/False to print developer messages.
    :return: The same ``ncbi_db`` object.
    """
    if debug:
        where = 'Update database at {}\n'.format(ncbi_db.dbfile)
        debug_message(where, "yellow")

    print('Updating the taxonomy database. This may take several minutes...\n')
    ncbi_db.update_taxonomy_database()

    ## print timestamp
    stamp_path = os.path.abspath(db_folder + '/timestamp_db.txt')
    time_functions.print_time_stamp(stamp_path)

    return ncbi_db
def BUSCO_run(sample_name, fasta, threads, output_name, dataset_name, mode, busco_db):
    """Run BUSCO for a sample and dataset unless a previous run succeeded.

    Results are expected in ``output_name/<dataset_name>/run_<dataset_name>``;
    a ``.success`` file in that folder marks a finished run. The process
    changes its working directory to ``output_name`` before calling BUSCO.

    :param sample_name: Sample name (used for messages).
    :param fasta: Input fasta file given to BUSCO with ``-i``.
    :param threads: Number of CPUs to use.
    :param output_name: Folder where BUSCO is executed.
    :param dataset_name: Dataset (lineage) name, or ``auto-lineage``.
    :param mode: BUSCO mode.
    :param busco_db: Path given to BUSCO with ``--download_path``.
    :return: ``'FAIL'`` if no ``short_summary.txt`` is produced, otherwise an
        empty tuple.
    """
    run_folder = os.path.join(output_name, dataset_name + '/run_' + dataset_name)
    filename_stamp = run_folder + '/.success'

    print (colored("\tBUSCO Dataset [%s]; Sample [%s]" %(dataset_name, sample_name), 'yellow'))

    ## check previous run
    if os.path.isfile(filename_stamp):
        timestamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tSuccessfully run on date: %s" %timestamp, 'green'))
        return()

    busco_bin = set_config.get_exe('busco')
    os.chdir(output_name)

    ## common part of the command
    base_cmd = '%s -f -i %s -c %s --mode %s --download_path %s ' %(busco_bin, fasta, threads, mode, busco_db)

    ## options if autolineage or given dataset
    if dataset_name == "auto-lineage":
        tail_cmd = '--auto-lineage -o %s > %s' %(dataset_name, 'auto_lineage.log')
    else:
        tail_cmd = '-l %s -o %s > %s' %(dataset_name, dataset_name, dataset_name + '.log')

    ## system call
    HCGB_sys.system_call(base_cmd + tail_cmd)

    ## no summary file means BUSCO failed
    if not os.path.isfile(run_folder + '/short_summary.txt'):
        print (colored("BUSCO failed: Dataset [%s]; Sample [%s]" %(dataset_name, fasta), 'red'))
        return ('FAIL')

    ## timestamp
    HCGB_time.print_time_stamp(filename_stamp)
    return()
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters.

    For every sample without a ``.success`` stamp in its folder, the process
    changes into the sample folder, calls ``agrvate_call`` and collects the
    returned table. Samples with a stamp are only reported. The original
    working directory is restored before returning, also when a sample
    raises an exception.

    :param dict_assemblies: Dictionary of sample name -> assembly file.
    :param dict_folders: Dictionary of sample name -> output folder.
    :param debug: True/False to print developer messages.
    :return: Dataframe with the results of the samples analysed in this call.
    """
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()

    print ("+ Checking agr genes for each sample retrieved...")

    agrvate_results = pd.DataFrame()

    ## No need to optimize. There is a problem with the working dir of agrvate
    ## and we need to change every time.
    try:
        for name, assembly_file in dict_assemblies.items():
            sample_folder = HCGB_files.create_folder(dict_folders[name])

            ## check if previously done and succeeded
            filename_stamp = sample_folder + '/.success'
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
                continue

            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')

            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    finally:
        ## always go back to where we started
        os.chdir(path_here)

    ## the message used to contain an unfilled '%s' placeholder
    print ("+ Jobs finished\n+ Collecting information for all samples...")

    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)

    return(agrvate_results)
def mapReads_caller(files, folder, name, threads, STAR_exe, genomeDir, limitRAM_option, Debug):
    """Map the reads of a sample with STAR unless previously done.

    A ``.success`` file in ``folder`` marks a finished mapping. In every case
    the expected BAM path is stored in the module-level dictionary
    ``mapping_results`` under the sample name.

    :param files: Read files of the sample.
    :param folder: Output folder for the mapping.
    :param name: Sample name.
    :param threads: Number of CPUs to use.
    :param STAR_exe: STAR executable.
    :param genomeDir: STAR genome directory.
    :param limitRAM_option: RAM limit forwarded to the mapping call.
    :param Debug: True/False to print developer messages.
    :return: Empty tuple.
    """
    filename_stamp = folder + '/.success'
    already_done = os.path.isfile(filename_stamp)

    if already_done:
        stamp = time_functions.read_time_stamp(filename_stamp)
        print(colored(
            "\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'STAR'),
            'yellow'))

    if not already_done:
        if Debug:
            print("\n** DEBUG: mapReads_caller options **\n")
            print("folder: " + folder)
            print("name: " + name)
            print("threads: " + str(threads))
            print("STAR_exe: " + STAR_exe)
            print("genomeDir: " + genomeDir)
            print("limitRAM_option: " + str(limitRAM_option))
            print("files: ")
            print(files)

        # Call STAR
        succeeded = mapReads.mapReads("LoadAndKeep", files, folder, name,
                                      STAR_exe, genomeDir, limitRAM_option,
                                      threads, Debug)
        if succeeded:
            time_functions.print_time_stamp(filename_stamp)
        else:
            print("+ Mapping sample %s failed..." % name)

    ## return results
    mapping_results[name] = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')
    return ()
def kma_ident_call(out_file, files, sample_name, index_name, kma_bin, option, threads):
    """Create kma system call for kmer identification.

    Paired-end or single end fastq files accepted. It generates a time stamp
    if succeeds: file ``.success_<basename of out_file>`` inside the folder of
    ``out_file``.

    :param out_file: Absolute path and basename for the output files generated with results.
    :param files: List of absolute paths for fastq files to search against the database.
    :param sample_name: Sample name. Not used by this function.
    :param index_name: Database name (given to kma with ``-t_db``).
    :param kma_bin: Binary executable for KMA software.
    :param option: Additional options to pass to the system call.
    :param threads: Number of CPUs to use.

    :type out_file: string
    :type files: list
    :type sample_name: string
    :type index_name: string
    :type kma_bin: string
    :type option: string
    :type threads: integer

    :returns: **OK** if the system call succeeded, **FAIL** otherwise.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.system_call`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`
    """
    ## stderr of kma goes to a log file next to the results
    out_file_log = out_file + '.log'

    if len(files) == 2:
        cmd_kma_search = "%s -ipe %s %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], files[1], out_file, index_name, threads, option, out_file_log)
    else:
        ## TODO: test Single End
        cmd_kma_search = "%s -i %s -o %s -t_db %s -t %s %s 2> %s" %(
            kma_bin, files[0], out_file, index_name, threads, option, out_file_log)

    code = HCGB_sys.system_call(cmd_kma_search)
    if (code != 'OK'):
        return('FAIL')

    ## success stamp: it has to live inside the output folder. The plain
    ## string concatenation used before lacked the path separator and wrote
    ## '<folder>.success_<tag>' next to the folder instead.
    basename_tag = os.path.basename(out_file)
    folder = os.path.dirname(out_file)
    filename_stamp = os.path.join(folder, '.success_' + basename_tag)
    HCGB_time.print_time_stamp(filename_stamp)
    return('OK')
def prepare_card_data(database_folder):
    """Download and parse the CARD ontology into ``<database_folder>/CARD``.

    A ``.success`` file in the CARD folder records the last successful
    update. Data is downloaded when no stamp exists or when the stamp is
    older than 30 days.

    :param database_folder: Folder where the ``CARD`` subfolder is created.
    :return: Path of the CARD folder, or the string ``'FAIL'`` if the parsed
        ontology is empty.
    """
    ## create CARD folder
    abs_folder = os.path.abspath(database_folder)
    CARD_folder = HCGB_files.create_subfolder('CARD', abs_folder)

    ## make stamp time
    filename_stamp = CARD_folder + '/.success'

    download = True
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [CARD Ontology Data]" %stamp, 'yellow'))

        ## check time passed
        days_passed = HCGB_time.get_diff_time(filename_stamp)
        print ("\t** %s days ago" %days_passed)
        if (days_passed > 30):
            ## download again
            print ("\t ** Downloading information again just to be sure...")
        else:
            print ("\t ** No need to download data again.")
            download = False

    if download:
        ## update database in a path
        aro_obo_file = card_trick.ontology_functions.update_ontology(CARD_folder, False)

        ## get ontology and save it in csv
        return_frame = card_trick.ontology_functions.parse_ontology(aro_obo_file, False)

        ## nothing parsed: report failure with the 'FAIL' string (the bare
        ## name FAIL used before is not defined and raised NameError)
        if return_frame.empty:
            return ('FAIL')

        ## success stamps
        HCGB_time.print_time_stamp(filename_stamp)

    ## return folder name
    return(CARD_folder)
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    """Call variants with snippy for a sample unless previously done.

    Results go to the subfolder ``<sample_name>_vs_<name>`` of ``outdir``; a
    ``.success`` file in it marks a finished run.

    :param reference: Reference forwarded to ``variant_calling.snippy_call``.
    :param files: Read files of the sample.
    :param threads: Number of CPUs to use.
    :param outdir: Folder where the mapping subfolder is created.
    :param name: Reference tag used in the subfolder name.
    :param contig_option: Forwarded to ``variant_calling.snippy_call``.
    :param other_options: Forwarded to ``variant_calling.snippy_call``.
    :param sample_name: Sample name.
    :param Debug: True/False to print developer messages.
    :return: ``'OK'`` if results were already available, otherwise the code
        returned by ``variant_calling.snippy_call``.
    """
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)

    ## check if previously processed and succeeded
    filename_stamp = subdir + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, tag), 'yellow'))
        ## report success explicitly so callers get a status on this path too
        return('OK')

    # Call variant calling
    code = variant_calling.snippy_call(reference, files, threads, subdir, sample_name, contig_option, other_options, Debug)
    if code == 'OK':
        HCGB_time.print_time_stamp(filename_stamp)

    return(code)
def parse_featureCount(out_file, path, name, bam_file, Debug):
    """
    Parses featureCount results for RNAbiotype analysis.

    Two tables are written: ``<out_file>.tsv`` with every feature count, the
    non-empty entries of ``<out_file>.summary`` and the number of unmapped
    reads; and ``<path>/<name>_RNAbiotype.tsv`` with the features with counts
    plus a single ``tRNA`` line that sums every feature whose ID contains
    ``tRNA``. A file ``.success_parse`` in ``path`` marks a finished parsing
    and makes later calls return immediately.

    :param out_file: Name provided to featureCount for output results.
    :param path: Folder for the RNA biotype table and the stamp file.
    :param name: Sample name, prefix of the RNA biotype table.
    :param bam_file: Mapping file; only its folder is used, to look for the
        mapping statistics (``Log.final.out`` or ``align_summary.txt``).
    :param Debug: True/False to print developer messages.
    :return: Tuple (path of the tsv table, path of the RNA biotype table).
    """
    ## file names
    out_tsv_file_name = out_file + '.tsv'
    RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

    ## previously parsed: report and leave
    filename_stamp_parse = path + '/.success_parse'
    if os.path.isfile(filename_stamp_parse):
        stamp = time_functions.read_time_stamp(filename_stamp_parse)
        print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'parse results'), 'yellow'))
        return(out_tsv_file_name, RNA_biotypes_file_name)

    ## debugging messages
    if Debug:
        print ("** DEBUG:")
        print ("Parse results for sample: " + name)

    ## every handle is managed by a context manager so that it is closed
    ## even if parsing fails half way
    with open(out_tsv_file_name, 'w') as out_tsv_file:

        ##########################################
        ### read count file
        ##########################################
        tRNA_count = 0
        with open(RNA_biotypes_file_name, 'w') as RNA_biotypes_file:
            with open(out_file) as count_file:
                count_file_lines = count_file.read().splitlines()

            for line in count_file_lines:
                ## skip comment and header lines
                if line.startswith(('#', 'Geneid')):
                    continue

                fields = line.split('\t')
                ID = fields[0]
                count = int(fields[-1])
                string2write_raw = "%s\t%s\n" %(ID, count)
                out_tsv_file.write(string2write_raw)

                ## tRNA entries are summed up, the rest reported if not empty
                if re.search(r".*tRNA", ID):
                    tRNA_count = int(tRNA_count) + int(count)
                elif (count > 0):
                    RNA_biotypes_file.write(string2write_raw)

            ## count and summary tRNA
            RNA_biotypes_file.write("tRNA\t%s\n" %tRNA_count)

        ##########################################
        ### read summary count file
        ##########################################
        with open(out_file + '.summary') as summary_count_file:
            summary_count_file_lines = summary_count_file.read().splitlines()

        for line in summary_count_file_lines:
            if line.startswith(('Status', 'Assigned')):
                continue

            ## adds Unassigned_Ambiguity
            ## adds Unassigned_NoFeatures
            fields = line.split('\t')
            ID = fields[0]
            count = int(fields[-1])

            ## skip empty entries
            if count == 0:
                continue
            out_tsv_file.write("%s\t%s\n" %(ID, count))

        ##########################################
        ## get mapping statistics according to mapping software
        ##########################################
        count_multi = 0
        count_unmap = 0
        mapping_folder = os.path.dirname(bam_file)
        mapping_stats = mapping_folder + '/Log.final.out'

        ## -------------------------------- ##
        ### STAR mapping
        ## -------------------------------- ##
        if files_functions.is_non_zero_file(mapping_stats):
            ## debugging messages
            if Debug:
                print ("** DEBUG:")
                print ("STAR mapping available for sample: " + name)
                print ("mapping_folder: " + mapping_folder)

            with open(mapping_stats) as mapping_stats_file:
                mapping_stats_file_lines = mapping_stats_file.read().splitlines()

            for line in mapping_stats_file_lines:
                multi_search = re.search(r".*Number of reads mapped to", line)
                unmap_search = re.search(r".*unmapped.*", line)
                input_search = re.search(r".*input reads.*", line)

                if input_search:
                    total_input_reads = int(line.split('\t')[-1])

                if multi_search:
                    count_tmp = int(line.split('\t')[-1])
                    count_multi = count_multi + count_tmp
                elif unmap_search:
                    ## unmapped reads are given as percentage of input reads
                    perc_tmp = line.split('\t')[-1]
                    count_reads = math_functions.percentage(perc_tmp, total_input_reads)
                    count_unmap = count_unmap + count_reads
        else:
            ## -------------------------------- ##
            ## tophat
            ## -------------------------------- ##
            mapping_stats = mapping_folder + '/align_summary.txt'
            count_map = 0
            total_input_reads = 0

            if files_functions.is_non_zero_file(mapping_stats):
                ## debugging messages
                if Debug:
                    print ("** DEBUG:")
                    print ("tophat mapping available for sample: " + name)
                    print ("mapping_folder: " + mapping_folder)

                with open(mapping_stats) as mapping_stats_file:
                    mapping_stats_file_lines = mapping_stats_file.read().splitlines()

                for line in mapping_stats_file_lines:
                    map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
                    input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
                    if input_search2:
                        total_input_reads = input_search2.group(1)
                    if map_search2:
                        count_map = map_search2.group(1)

                ####
                count_unmap = int(total_input_reads) - int(count_map)
            else:
                ## other
                print ("Neither tophat or STAR..., no mapping statistics")

        ### print mapping stats
        out_tsv_file.write("unmapped\t%s\n" %count_unmap)

    ## print timestamp
    time_functions.print_time_stamp(filename_stamp_parse)

    return(out_tsv_file_name, RNA_biotypes_file_name)
def NCBIdownload(acc_ID, data, data_folder):
    """Download a genome from NCBI and split chromosome and plasmid sequences.

    Sequences whose description contains ``plasmid`` are written to
    ``<acc_ID>_plasmid.fna``, the rest to ``<acc_ID>_chromosome.fna``. A one
    row table describing the files is saved as ``info.txt`` and a
    ``.success`` stamp is written in the download folder.

    :param acc_ID: Accession identifier; also a row label of ``data``.
    :param data: Dataframe indexed by accession with columns ``genus``,
        ``species`` and ``name``.
    :param data_folder: Base folder; files go to
        ``<data_folder>/genbank/bacteria/<acc_ID>``.
    :return: Path of the ``info.txt`` file.
    """
    ## module ngd requires to download data in bacteria subfolder under genbank folder
    dir_path = os.path.join(data_folder, 'genbank', 'bacteria', acc_ID)
    ngd_download(dir_path, acc_ID, data_folder)

    ## get files download
    (genome, prot, gff, gbk) = get_files_download(dir_path)

    ## names of the plasmids found
    plasmid_id = []

    contig_out_file = dir_path + '/' + acc_ID + '_chromosome.fna'
    plasmid_out_file = dir_path + '/' + acc_ID + '_plasmid.fna'

    ## The plasmid file is created on the first plasmid only and it is
    ## truncated, so that a second run does not append duplicated records.
    plasmid_out_file_handle = None
    try:
        with open(contig_out_file, 'w') as contig_out_file_handle:
            for seq_record in SeqIO.parse(genome, "fasta"):
                if re.search(r".*plasmid.*", seq_record.description):
                    ### Separate plasmids from main sequence
                    plasmid_id.append(str(seq_record.id))
                    if plasmid_out_file_handle is None:
                        plasmid_out_file_handle = open(plasmid_out_file, 'w')
                    plasmid_out_file_handle.write(seq_record.format("fasta"))
                    plasmid_out_file_handle.write('\n')
                else:
                    contig_out_file_handle.write(seq_record.format("fasta"))
                    contig_out_file_handle.write('\n')
    finally:
        if plasmid_out_file_handle is not None:
            plasmid_out_file_handle.close()

    ## no plasmids found
    plasmid_count = len(plasmid_id)
    if plasmid_count == 0:
        plasmid_out_file = ""

    ## Values follow the column order: GFF, GBK, proteins. The protein and
    ## GenBank files used to be swapped.
    data2download = pd.DataFrame(columns=('ID', 'folder', 'genus', 'species', 'name', 'genome', 'chr', 'GFF', 'GBK', 'proteins', 'plasmids_number', 'plasmids_ID', 'plasmids'))
    data2download.loc[len(data2download)] = (
        acc_ID, dir_path,
        data.loc[acc_ID]['genus'], data.loc[acc_ID]['species'], data.loc[acc_ID]['name'],
        genome, contig_out_file,
        gff, gbk, prot,
        plasmid_count, "::".join(plasmid_id), plasmid_out_file)

    ## dump to file
    info_file = dir_path + '/info.txt'
    data2download.to_csv(info_file)

    ## timestamp
    HCGB_time.print_time_stamp(dir_path + '/.success')

    ## return data
    return (info_file)
def check_sample_assembly(name, sample_folder, files, threads):
    """Checks if sample is assembled.

    It checks whether a sample is assembled or not by reading file
    *sample_folder/.success_all*. If the file is not available (no previous
    assembly or it did not succeed) it calls
    :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly` to
    generate the assembly for the sample specified.

    :param name: Sample name or tag to identify sample.
    :param sample_folder: Directory to generate assembly output. It must exist.
    :param files: List containing files (fastq R1 & R2) for the sample to be assembled.
    :param threads: Number of CPUs to use

    :type name: string
    :type sample_folder: string
    :type files: list
    :type threads: integer

    :return: Nothing. It populates the module-level dictionary
        ``assembly_stats`` with the assembly information of the sample.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
    """
    success_file = sample_folder + '/.success_all'

    ## assembly has to be generated
    if not os.path.isfile(success_file):
        ## debug message
        if (Debug):
            HCGB_aes.debug_message(
                "spades_assembler.run_module_assembly call:", "yellow")
            print("spades_assembler.run_module_assembly " + name + "\t" +
                  sample_folder + "\t" + files[0] + "\t" + files[1] + "\t" +
                  str(threads) + "\n")

        # Call spades_assembler
        result = spades_assembler.run_module_assembly(
            name, sample_folder, files[0], files[1], threads)

        if (result == 'FAIL'):
            print(
                "Some error occurred for sample %s while generating the assembly. "
                % name)
            return

        ## success stamp
        HCGB_time.print_time_stamp(success_file)
        assembly_stats[name] = result  # list containing dictionary of data and excel
        return

    ## previously assembled and succeeded
    previous = HCGB_time.read_time_stamp(success_file)
    print(colored(
        "\tA previous command generated results on: %s [%s]" % (previous, name),
        'yellow'))

    ## Get information from the tables of the previous run
    sample_prefix = sample_folder + '/' + name
    contig_stats = HCGB_main.file2dictionary(sample_prefix + '_assembly-contigs.csv', ',')
    scaffold_stats = HCGB_main.file2dictionary(sample_prefix + '_assembly-scaffolds.csv', ',')
    stat_output = {
        'Contig Stats': contig_stats,
        'Scaffold Stats': scaffold_stats
    }

    ## populate main dictionary
    assembly_stats[name] = [stat_output, sample_prefix + '_assembly_stats.xlsx']
def run_search(arg_dict):
    """Main function of the search module in BacDup package.

    This module searches and create gene duplication analysis.

    It allows the user to provide either a previous parsed data project (NCBI
    Genbank IDs, taxonomy or user annotation data) or a single or multiple
    samples.

    For every sample, stamp files in its ``dups`` folder (``.filter_success``
    and ``.annot_success``) control which steps are repeated. A summary table
    of all samples is written to ``report/dups/info_annot.csv``.

    :param arg_dict: Parsed command line options.
    :return: Empty tuple.
    """
    ## help messages
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: ****
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## folders for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)
    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)
    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug)

    ## create results: one entry per sample, merged once after the loop
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    sample_entries = []

    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            data_format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, data_format, arg_dict.pident,
                arg_dict.evalue, arg_dict.percentage, arg_dict.bitscore,
                folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(filter_timestamp)):
                # save results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample], 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(colored(
                    "\t+ Filter results already available for sample %s [%s]" % (sample, read_time),
                    'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(colored(
                "\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time),
                'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## keep entry; a Series is turned into a one-row frame, which is how
        ## DataFrame.append used to treat it
        if isinstance(data2add_entry, pd.Series):
            data2add_entry = data2add_entry.to_frame().T
        sample_entries.append(data2add_entry)

    ## merge data (DataFrame.append was removed in pandas 2.0)
    if sample_entries:
        data2add = pd.concat([data2add] + sample_entries, ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
def ariba_getref(database, outdir, Debug, threads):
    """Download an ARIBA reference database and prepare it for later use.

    Runs ``ariba getref <database> <outdir>/<database>`` and afterwards
    :func:`ariba_prepareref` on the fasta/tsv files found in ``outdir``.
    Each step is skipped when its ``.success`` time stamp file exists.

    :param database: Database to download. Must be one of: argannot, card,
        megares, plasmidfinder, resfinder, srst2_argannot, vfdb_core,
        vfdb_full, virulencefinder.
    :param outdir: Folder where the downloaded files are stored. The
        prepared reference is written to ``<outdir>_prepareref``.
    :param Debug: True/false for printing debugging messages.
    :param threads: Value handed over to :func:`ariba_prepareref`.

    :returns: ``'FAIL'`` when the ``ariba getref`` system call fails,
        an empty tuple otherwise.
    """
    ######################################################################################
    ## usage: ariba getref [options] <db> <outprefix>
    ######################################################################################
    ## Download reference data from one of a few supported public resources
    ## positional arguments:
    ##     DB name     Database to download.
    ##     outprefix   Prefix of output filenames
    ######################################################################################

    ## folders
    outdir_name = outdir + '/' + database
    outdir_prepare_ref = outdir + '_prepareref'

    ## download information in database folder provided by config
    print("\t+ Retrieve information from database: " + database)

    ## check if previously downloaded and succeeded
    filename_stamp = outdir + '/.success'
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        ## the database name identifies the step: the former code used an
        ## undefined variable here and crashed on every re-run
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, database), 'yellow'))
        download_ariba_cmd = 'OK'
    else:
        cmd_getref = 'ariba getref %s %s' % (database, outdir_name)
        download_ariba_cmd = HCGB_sys.system_call(cmd_getref)

    if (download_ariba_cmd == 'OK'):
        HCGB_time.print_time_stamp(filename_stamp)
        ## debug message
        if (Debug):
            print(colored("**DEBUG: ariba getref %s succeed " % database + "**", 'yellow'))
    else:
        ## rise error & exit
        print(colored("***ERROR: ariba getref %s failed " % database + " **", 'red'))
        return('FAIL')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: Run ariba prepareref %s " % database + "**", 'yellow'))

    ## check if previously prepareref and succeeded
    filename_stamp_prepare = outdir_prepare_ref + '/.success'
    if os.path.isfile(filename_stamp_prepare):
        stamp = HCGB_time.read_time_stamp(filename_stamp_prepare)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, database), 'yellow'))
    else:
        ## get information: last matching file of each type is used
        fasta = ""
        metadata = ""
        for f in os.listdir(outdir):
            if f.endswith('tsv'):
                metadata = outdir + '/' + f
            elif f.endswith('fa'):
                fasta = outdir + '/' + f

        code = ariba_prepareref(fasta, metadata, outdir_prepare_ref, threads)
        if (code == 'OK'):
            HCGB_time.print_time_stamp(filename_stamp_prepare)

    return()
def results_parser(database, folderResults, sampleName, outfolder,
                   assembly_cutoff, card_trick_info):
    """Parse ARIBA results

    Extracts the compressed files found in the results folder, expands the
    ARIBA flags, builds a per-sample summary and finally hands everything
    over to the parser matching the database provided.

    :returns: Tuple ``(name_excel, name_csv)`` produced by the database
        parser; ``('NaN', 'NaN')`` if the results folder does not exist and
        ``('', '')`` if no report with content is available.

    .. seealso:: Additional information to ARIBA results generated.

        - :ref:`ARIBA-explained`
    """
    ## nothing to do without a results folder
    if not os.path.exists(folderResults):
        print(
            "+ Finish parsing information for sample [%s]. Results folder does not exist."
            % sampleName)
        return ('NaN', 'NaN')

    folder_content = os.listdir(folderResults)

    print("\n+ Parsing result file for sample: ", sampleName)

    ## decompress whatever is gzipped and locate the ARIBA report
    print("\n+ Extracting files if necessary:")
    report_file = ""
    for entry in folder_content:
        entry_path = os.path.join(folderResults, entry)
        if entry.endswith('.gz'):
            HCGB_files.extract(entry_path, folderResults)
        if entry == 'report.tsv':
            report_file = entry_path
    print("\n")

    ## no results generated
    if not HCGB_files.is_non_zero_file(report_file):
        print('+ No results generated for sample: ', sampleName)
        return ('', '')

    ### expand flags
    flags_table = ariba_caller.ariba_expandflag(
        report_file, folderResults + '/flags_explain.tsv')

    ######################
    ## generate summary
    ######################
    ## ariba has function that generates a summary for samples
    ## https://github.com/sanger-pathogens/ariba/wiki/The-assembled-column-from-ariba-summary
    tmp_prefix = folderResults + '/report_summary_tmp'
    summary_csv = folderResults + '/report_summary.csv'
    ariba_caller.ariba_summary(tmp_prefix, [report_file], "--no_tree")

    ## fix names: just for aesthetics
    ariba_caller.fix_ariba_summary(tmp_prefix + '.csv', summary_csv,
                                   {sampleName: report_file})
    os.remove(tmp_prefix + '.csv')

    ############################################
    ### check results according to database
    ############################################
    shared_args = (outfolder, sampleName, report_file, flags_table,
                   summary_csv)
    if database == 'vfdb_full':
        parsed = parse_vfdb(*shared_args, assembly_cutoff)
    elif database == 'card':
        parsed = parse_card(*shared_args, assembly_cutoff, card_trick_info)
    else:
        ## [TODO] check results according to databases different than CARD/VFDB
        parsed = parse_results(*shared_args)

    (name_excel, name_csv) = parsed
    print('\tCheck additional information on ', name_excel)

    ## print success timestamp
    HCGB_time.print_time_stamp(outfolder + '/.success_' + database)

    return (name_excel, name_csv)
def parse_information(arg_dict, df_accID, outdir):
    """Parse the annotation file of every sample listed in ``df_accID``.

    For each sample an "input" and a "parse" folder are obtained via
    ``HCGB_files.outdir_project``. When neither folder holds a ``.success``
    time stamp, :func:`parse_annot_file` is called; on success the
    annotation file is linked into the input folder, the sample row is
    saved as ``info.csv`` and both ``.success`` stamps are written. On
    failure ``.fail`` stamps are written in both folders instead.

    :param arg_dict: Options object; ``project`` and ``debug`` are read.
    :param df_accID: Dataframe indexed by sample; columns ``annot_file`` and
        ``genome`` are read.
    :param outdir: Output folder handed over to ``outdir_project``.
    """
    debug = arg_dict.debug

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input", debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse", debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():
        folder_parse = dict_parse_folders[sample]

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + folder_parse, 'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'],
                          'yellow')
            debug_message('genome' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(folder_parse, '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        ## any of the two stamps marks the sample as already processed
        already_done = (HCGB_files.is_non_zero_file(parse_timestamp)
                        or HCGB_files.is_non_zero_file(input_timestamp))

        if already_done:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(
                colored(
                    "\t+ Input parsing already available for sample %s [%s]"
                    % (sample, read_time), 'green'))
            print()
            continue

        ## TODO: Set threads to use in parallel
        process_OK = parse_annot_file(sample, folder_input,
                                      df_accID.loc[sample, 'annot_file'],
                                      folder_parse, arg_dict.debug,
                                      df_accID.loc[sample, 'genome'])

        if not process_OK:
            print(
                colored(
                    "\t+ Some error occurred for sample %s while parsing input options"
                    % sample, 'red'))

            ## leave a fail stamp in both folders
            HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
            HCGB_time.print_time_stamp(os.path.join(folder_parse, '.fail'))
            continue

        ## link or copy annotation file into folder_input
        HCGB_files.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'],
                                          folder_input)

        ## add df_accID.loc[sample,] information as csv into input folder
        df_accID.loc[sample, ].to_csv(os.path.join(folder_input, 'info.csv'),
                                      index=True,
                                      header=True)

        ## print time stamps
        HCGB_time.print_time_stamp(input_timestamp)
        HCGB_time.print_time_stamp(parse_timestamp)
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """Executes KMA identification jobs

    Checks whether a previous run left a ``.success_<outfile basename>``
    stamp in the ``kma`` subfolder; if so it only reports it, otherwise it
    calls the KMA identification for the sample and database provided and
    writes the stamp afterwards.

    :param outdir_file: Folder in which the ``kma`` subfolder is created.
    :param list_files: Input files handed over to the KMA call.
    :param name: Sample name.
    :param database: Database to use.
    :param threads: Number of threads handed over to the KMA call.
    :param Debug: True/false for printing debugging messages.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`

        - :func:`BacterialTyper.module.ident.get_outfile`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`
    """
    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile and the stamp file derived from its basename
    outfile = get_outfile(outdir_dict_kma, name, database)
    filename_stamp = outdir_dict_kma + '/.success_' + os.path.basename(outfile)

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    ## previously run and succeeded: report and leave
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
        return

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: species_identification_KMA.kma_ident_module call**",
                'yellow'))
        print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
        print("outfile: ", outfile)
        print(
            "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
        )
        call_fields = [
            "species_identification_KMA.kma_ident_module", outfile,
            str(list_files), name, database,
            str(threads)
        ]
        print("\t".join(call_fields) + "\n")

    ## Option to retrieve database from shared memory
    option = '-shm 1'

    # Call KMA
    species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                              database, kma_bin, option,
                                              threads)
    HCGB_time.print_time_stamp(filename_stamp)
def download_kma_database(folder, database, debug):
    """
    Downloads databases from KMA website.

    Using the latest available ftp datasets, this function downloads available datasets using
    function :func:`BacterialTyper.scripts.functions.wget_download`.

    Ftp site: "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"

    It also downloads the md5sum for the dataset selected and compares it with
    the checksum of the downloaded archive before extracting it.

    :param folder: Absolute path to folder that contains database.
    :param database: Possible options: [bacteria, archaea, protozoa, fungi, plasmids, typestrains, viral].
    :param debug: True/false for printing debugging messages.

    :type folder: string
    :type database: string
    :type debug: boolean

    :returns: The string ``"Error"`` when the download, the integrity check or
        the extraction fails; ``None`` otherwise.

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.wget_download`

        - :func:`BacterialTyper.scripts.functions.check_md5sum`

        - :func:`BacterialTyper.scripts.functions.extract`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`
    """

    ## ToDo: update with latest version
    ftp_site = "http://www.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"

    ## In v20190107 there was a plasmid database.
    #ftp_site = "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/20190107/"

    ############################################################################
    ## ToDo: Set automatic: download config file and look for prefix for each
    ## sample and generate a dictionary to code the prefix for each db.
    ############################################################################

    # Database configuration file - Describes the content of the database
    # Each db consist of 5 files with the following extensions: b, comp.b, length.b, seq.b, name
    # Other important files are: .name, .kma.entries.all, .kma.entries.deleted, .kma.entries.added, .md5
    # db_prefix	name	description
    #bacteria.ATG	Bacteria Organisms	Bacteria organisms library prefix=ATG
    #plasmids.T	Bacteria Plasmids	Bacteria plasmids library prefix=T
    #typestrains.ATG	Bacteria Type Strains	Bacteria type strains library prefix=ATG
    #fungi.ATG	Fungi	Fungi library prefix=ATG
    #protozoa.ATG	Protozoa	Protozoa library prefix=ATG
    #archaea.ATG	Archaea	Archaea library prefix=ATG

    HCGB_files.create_folder(folder)

    ## debug message
    if (debug):
        print(colored("Function call: download_kma_database " + folder + ' ' + database + '\n', 'yellow'))

    ## prefix
    if (database == 'plasmids'):
        prefix = '.T'
    elif (database == 'viral'):
        prefix = '.TG'
    else:
        prefix = '.ATG'

    index_name = os.path.join(folder, database + prefix)

    ## check if already download
    return_code_down = False
    if os.path.exists(folder):
        return_code_down = check_db_indexed(index_name, folder)
        ## debug message
        if (debug):
            print(colored("Folder database is already available:" + folder, 'yellow'))

    if (return_code_down == False):  ## folder does not exists

        ## Download data
        print("\t+ Downloading data now, it may take a while....")

        ## debug message
        if (debug):
            print(colored("Download files via function wget_download:", 'yellow'))

        ## connect to url
        url = ftp_site + database + '.tar.gz'
        HCGB_sys.wget_download(url, folder)

        md5_url = ftp_site + database + '.md5'
        HCGB_sys.wget_download(md5_url, folder)
        print("\n\t+ Data downloaded.....")

        ## get files
        tar_file = ""
        md5_sum = ""
        for f in os.listdir(folder):
            if f.endswith('tar.gz'):
                tar_file = folder + '/' + f
            elif f.endswith('md5'):
                md5_sum = folder + '/' + f

        ## a failed download leaves no archive and/or no md5 file: stop here
        ## with the usual error code instead of crashing on a missing file
        if not tar_file or not md5_sum:
            print(colored("*** ERROR: Database archive or md5 file could not be downloaded into folder: %s ***" % folder, 'red'))
            return ("Error")

        ## check md5sum
        print("\t+ Checking for integrity using md5sum")

        # get md5 sum from source: first whitespace-separated field
        md5_string = ""
        with open(md5_sum, 'r') as myfile:
            line = myfile.read()

        line = re.sub(r"\s", ',', line)
        md5_string = line.split(",")[0]

        ## calculate md5 for file
        result_md5 = HCGB_sys.check_md5sum(md5_string, tar_file)  ## FIXME: Not conda supported
        if (result_md5 == True):

            ## debug message
            if (debug):
                print(colored("result md5sum matches code provided for file " + tar_file, 'yellow'))

            # extract
            print("\t+ Extracting database into destination folder: " + folder)
            HCGB_files.extract(tar_file, folder)

        else:
            print(colored("*** ERROR: Some error occurred during the downloading and file is corrupted ***", 'red'))
            return ("Error")

    ## database should be unzipped and containing files...
    return_code_extract = check_db_indexed(index_name, folder)
    if (return_code_extract):
        print("+ Database (%s) successfully extracted in folder: %s..." % (database, folder))
    else:
        string = "*** ERROR: Some error occurred during the extraction of the database (%s). Please check folder (%s) and downloading and file is corrupted ***" % (database, folder)
        print(colored(string, 'red'))
        return ("Error")

    ## print timestamp
    filename_stamp = folder + '/.success'
    HCGB_time.print_time_stamp(filename_stamp)
def pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug):
    """Create a pie plot plus table PDF summarizing RNA biotype counts.

    The statistics file is read as a headerless, tab separated table whose
    first column is the biotype and second column the count. Biotypes with
    less than 1% of the total count are merged into an ``Other`` class for
    the pie plot; the table shows every biotype with its percentage.

    The figure is saved as ``<folder>/<name>_RNAbiotypes.pdf`` and the time
    stamps ``.success_plot`` and ``.success_all`` are written in ``folder``.
    Nothing is done when ``.success_plot`` already exists.

    :param RNAbiotypes_stats_file: Path to the biotype statistics file.
    :param name: Sample name, used for messages and the PDF file name.
    :param folder: Folder to store the PDF and the time stamps.
    :param Debug: Not used by this function.
    """
    ##
    filename_stamp_plot = folder + '/.success_plot'
    if os.path.isfile(filename_stamp_plot):
        stamp = time_functions.read_time_stamp(filename_stamp_plot)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'plot results'), 'yellow'))
    else:

        # PLOT and SHOW results
        RNAbiotypes_stats = main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None')

        # create plot: keep the handle so this exact figure can be closed later
        fig = plt.figure(figsize=(16, 8))
        df_genetype_2 = pd.DataFrame({'Type': RNAbiotypes_stats[0],
                                      'Count': RNAbiotypes_stats[1]}).sort_values(by=['Count'])

        ## get total count
        df_genetype_ReadCount_sum = df_genetype_2['Count'].sum()

        ## filter 1% values
        minimun = df_genetype_ReadCount_sum * 0.01
        df_genetype_filter_greater = df_genetype_2[df_genetype_2['Count'] >= minimun]
        df_genetype_filter_smaller = df_genetype_2[df_genetype_2['Count'] < minimun]

        ## create %values
        df_genetype_2['Percentage'] = (df_genetype_2['Count'] / df_genetype_ReadCount_sum * 100).round(3)

        ## merge and generate Other class
        df_genetype_filter_smaller_sum = df_genetype_filter_smaller['Count'].sum()  ## total filter smaller

        ## DataFrame.append() no longer exists in pandas >= 2.0: use concat
        other_class = pd.DataFrame([{'Type': 'Other',
                                     'Count': df_genetype_filter_smaller_sum}])
        df_genetype_filter_greater2 = pd.concat([df_genetype_filter_greater, other_class],
                                                ignore_index=True)

        ## Create Pie Plot
        ax1 = plt.subplot(121, aspect='equal')
        df_genetype_filter_greater2.plot.pie(
            y='Count',
            ax=ax1,
            autopct='%1.2f%%',
            shadow=False,
            labels=df_genetype_filter_greater2['Type'],
            legend=False)

        # plot table
        ax2 = plt.subplot(122)
        plt.axis('off')
        tbl = ax2.table(
            cellText=df_genetype_2.values,
            colLabels=df_genetype_2.columns,
            loc='center', rowLoc='left', cellLoc='center',
        )
        tbl.auto_set_font_size(True)
        #tbl.set_fontsize(12)
        tbl.scale(1.1, 1.1)

        ## set PDF name
        name_figure = os.path.join(folder, name + '_RNAbiotypes.pdf')

        ## generate image
        plt.savefig(name_figure)

        ## close the figure itself: closing by file name would look for a
        ## figure labelled like the path, find none and leak this one
        plt.close(fig)

        ## print time stamps
        time_functions.print_time_stamp(filename_stamp_plot)
        filename_stamp_all = folder + '/.success_all'
        time_functions.print_time_stamp(filename_stamp_all)
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval

    This functions uses the software edirect_ to connect to NCBI and retrieve some information regarding samples, assemblies, publications, etc.

    For every sample the best chromosome hit (database ``bacteria.ATG``) is
    used to query ``nuccore`` and ``assembly``; plasmid hits (database
    ``plasmids.T``) are reported as a comma separated list of accessions.
    Samples without chromosome hit are reported with empty taxonomy fields.
    Samples for which the NCBI query returns no information are skipped.

    :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param Debug: True/false for printing debugging messages.

    :type dataFrame: pandas.DataFrame()
    :type outdir_dict: Dictionary

    :return: Information of the identification, one row per sample with columns
        sample, genus, species, strain, BioSample, genome, Plasmids.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:

    .. include:: ../../devel/results/edirect_download_results.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.scripts.functions.get_info_file`

        - :func:`BacterialTyper.scripts.functions.read_time_stamp`

        - :func:`BacterialTyper.scripts.functions.print_time_stamp`

        - :func:`BacterialTyper.scripts.functions.optimize_threads`

        - :func:`BacterialTyper.scripts.functions.create_subfolder`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.scripts.functions.is_non_zero_file`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`

        - :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`

    .. include:: ../../links.inc
    """
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species",
                                          "strain", "BioSample", "genome",
                                          "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe sample name
    ## group by a scalar key: a one-element list makes recent pandas
    ## versions return 1-tuples as group names, breaking outdir_dict[name]
    sample_results = dataFrame.groupby("Sample")

    for name, grouped in sample_results:

        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect',
                                                     outdir_dict[name])

        ## stamp is only written when chromosome information was retrieved
        filename_stamp = ''

        ## chromosome match
        chromosome_hits = grouped.loc[grouped['Database'] ==
                                      'bacteria.ATG']['#Template']
        if (len(chromosome_hits) == 0):
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")

            ## every field reported below needs a value in this branch too,
            ## otherwise values of the previous sample would be re-used
            genus = ''
            species = ''
            strain = ''
            BioSample_name = ''
            AssemblyAcc = ''
            genome_acc = ''

        else:
            nucc_entry = chromosome_hits.values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            ##
            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'
            filename_stamp = edirect_folder + '/.success_species'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]"
                        % (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0],
                                                    out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain',
                    tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
            taxa_fields = taxa_name_tmp[0].split(',')
            Organism = taxa_fields[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = taxa_fields[1]  ## BioSample
            AssemblyAcc = taxa_fields[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(taxa_fields) > 3:
                strain = taxa_fields[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc,
                                                out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly,
                                                'DocumentSummary', 'Genbank',
                                                AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank

            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            genome_acc = GenbankAcc[0]
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", genome_acc)

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join(
            [i.split()[0] for i in plasmid_entries])

        ## save edirect_frame
        #("sample", "taxa", strain, genome "BioSample", "Plasmids"))
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, genome_acc,
                                                 plasmid_entries_str)

        if filename_stamp:
            HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect, retrieve_databases): """Generate MLST profile identification This functions uses the `MLSTar software`_ to retrieve Multi locus sequence typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates MLST profiling for each sample. :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in... :param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`. :param outdir_dict: dictionary containing information for each sample of the output folder for this process. :param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`. :param retrieve_databases: :type options: :type dataFrame: pandas.DataFrame() :type outdir_dict: Dictionary :type dataFrame_edirect: pandas.DataFrame() :type retrieve_databases: pandas.DataFrame() :return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to file generate by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information. :rtype: Dictionary See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here: .. include:: ../../devel/results/doMLST_result_example.csv :literal: .. seealso:: Additional information to PubMLST available datasets. - :doc:`PubMLST datasets<../../../data/PubMLST_datasets>` .. seealso:: This function depends on other ``BacterialTyper`` functions called: - :func:`BacterialTyper.scripts.functions.read_time_stamp` - :func:`BacterialTyper.scripts.functions.create_subfolder` - :func:`BacterialTyper.scripts.functions.boxymcboxface` - :func:`BacterialTyper.scripts.MLSTar.run_MLSTar` - :func:`HCGB.sampleParser.files.get_files` - :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species` .. 
include:: ../../links.inc """ ## set config rscript = set_config.get_exe("Rscript") ## TODO: Samples might not be assembled...to take into account and return 0 ## TODO: Fix and install MLSTar during installation print(MLSTar.get_MLSTar_package_installed()) exit() ######################################################################################## ## TODO: What to do if multi-isolate sample? ## TODO: Control if a different profile is provided via --MLST_profile ## TODO: Check time passed and download again if >?? days passed] ## debug message if (Debug): print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow')) print(dataFrame_edirect) ## MLST call HCGB_aes.boxymcboxface("MLST typing") print( "+ Create classical MLST typification of each sample according to species retrieved by kmer..." ) ## get assembly files input_dir = os.path.abspath(options.input) assembly_samples_retrieved = sampleParser.files.get_files( options, input_dir, "assembly", ["fna"], options.debug) ## debug message if (Debug): print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow')) print(assembly_samples_retrieved) # init MLST_results = {} ## get MLST_profile: default or provided mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] == 'PubMLST']['path'].tolist() if (Debug): print("** Debug **") print("mlst_profile_list") print(mlst_profile_list) print("dataFrame_edirect") print(dataFrame_edirect) ## Generate MLST call according to species identified for each sample for index, row in dataFrame_edirect.iterrows(): MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'], row['species']) if (MLSTar_taxa_name == 'NaN'): print( colored( "\t- Not available PubMLST profile for sample [%s] identified as %s %s" % (row['sample'], row['genus'], row['species']), 'yellow')) else: for mlst_profile in mlst_profile_list: ## species folder #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder) species_mlst = mlst_profile.split(',')[0] 
species_mlst_folder = mlst_profile.split(',')[1] ## output file output_file = species_mlst_folder + '/PubMLST_available_scheme.csv' filename_stamp = species_mlst_folder + '/.success_scheme' ## if MLSTar_taxa_name == species_mlst: if os.path.isfile(filename_stamp): stamp = HCGB_time.read_time_stamp(filename_stamp) print( colored( "\tA previous command generated results on: %s" % stamp, 'yellow')) else: ### get scheme available MLSTar.getPUBMLST(MLSTar_taxa_name, rscript, output_file) stamp = HCGB_time.print_time_stamp(filename_stamp) ## parse and get scheme for classical MLST schemes_MLST = pd.read_csv(output_file, sep=',', header=0) ## for item, cluster in schemes_MLST.iterrows(): if cluster['len'] < 10: scheme2use = int(cluster['scheme']) continue ### sample = row['sample'] MLSTar_folder = HCGB_files.create_subfolder( 'MLST', outdir_dict[sample]) genome_file = assembly_samples_retrieved.loc[ assembly_samples_retrieved['name'] == sample]['sample'].values[0] ## call MLST (results, profile_folder) = MLSTar.run_MLSTar( species_mlst_folder, rscript, MLSTar_taxa_name, scheme2use, sample, MLSTar_folder, genome_file, options.threads) MLST_results[sample] = results ## print("+ Finish this step...") return (MLST_results)
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
    """Run featureCounts by transcript biotype for a sample and parse the output.

    Results are written in ``path`` (created if missing). The whole step is
    skipped when ``.success_all`` exists; the featureCounts system call is
    skipped when ``.success_featureCounts`` exists. If the system call
    fails, an error is printed and the program exits.

    :param featureCount_exe: featureCounts executable.
    :param path: Folder for results.
    :param gtf_file: Annotation file given to featureCounts via ``-a``.
    :param bam_file: Mapping file to count.
    :param name: Sample name, used for messages and the log file name.
    :param threads: Value given to featureCounts via ``-T``.
    :param Debug: True/false for printing debugging messages.
    :param allow_multimap: If true use ``-M -O``, otherwise ``--largestOverlap``.
    :param stranded: Value given to featureCounts via ``-s``.

    :returns: Empty tuple.
    """
    ## folder for results
    if not os.path.isdir(path):
        files_functions.create_folder(path)

    out_file = os.path.join(path, 'featureCount.out')
    logfile = os.path.join(path, name + '_RNAbiotype.log')

    ## everything was done before: nothing else to do
    filename_stamp_all = path + '/.success_all'
    if os.path.isfile(filename_stamp_all):
        stamp = time_functions.read_time_stamp(filename_stamp_all)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'RNAbiotype'), 'yellow'))
        return()

    filename_stamp_featureCounts = path + '/.success_featureCounts'
    if not os.path.isfile(filename_stamp_featureCounts):

        ## debugging messages
        if Debug:
            print("** DEBUG:")
            print("featureCounts system call for sample: " + name)
            print("out_file: " + out_file)
            print("logfile: " + logfile)

        ## only the multimapping flags differ between both modes
        overlap_flags = '-M -O' if allow_multimap else '--largestOverlap'
        cmd_featureCount = ('%s -s %s %s -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' % (
            featureCount_exe, stranded, overlap_flags, threads, gtf_file, out_file, bam_file, logfile))

        ## system call
        if not system_call_functions.system_call(cmd_featureCount, False, True):
            print("** ERROR: featureCount failed for sample " + name)
            exit()

        ## print time stamp
        time_functions.print_time_stamp(filename_stamp_featureCounts)
    else:
        stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'featureCounts'), 'yellow'))

    ## parse results
    (extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)

    ## debugging messages
    if Debug:
        print("** DEBUG:")
        for label, stats_file in (("extended_Stats: ", extended_Stats_file),
                                  ("RNAbiotypes_stats: ", RNAbiotypes_stats_file)):
            print(label + stats_file)
            print(main_functions.get_data(stats_file, '\t', 'header=None'))

    return ()
def run_annotation(options):
    """Main function of the annotation module.

    Retrieves the assembled genomes (``fna`` files) from the input folder,
    annotates each sample in parallel via :func:`annot_caller`, optionally
    generates a MultiQC report of the Prokka results and finally checks the
    annotations using BUSCO.

    Any of the ``help_*`` options prints the corresponding help message and
    exits without running the pipeline.

    :param options: Parsed command line options. Among others ``input``,
        ``output_folder``, ``detached``, ``threads``, ``kingdom``,
        ``genera``, ``skip_report`` and ``debug`` are read; ``batch`` and
        ``project`` are set here.

    :returns: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## as any other help option: do not start the pipeline afterwards
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization

    ## the executor refuses to start with less than one worker
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## report the failing sample but keep processing the others
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = list(outdir_dict.values())
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" %
                        stamp, 'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
def trimmo_call(java_path, sample_folder, sample_name, files, trimmomatic_jar,
                threads, trimmomatic_adapters, Debug):
    """Trim the reads of one sample with Trimmomatic through a system call.

    Handles paired-end input (exactly two entries in ``files``) and
    single-end input (any other non-empty ``files``; only the first entry
    is used). ``sample_folder`` must exist before calling this function.

    Trimming steps applied (http://www.usadellab.org/cms/?page=trimmomatic):

    * ``ILLUMINACLIP:<adapters>:2:30:10`` remove adapters
    * ``LEADING:11`` remove leading low quality or N bases (below quality 11)
    * ``TRAILING:11`` remove trailing low quality or N bases (below quality 11)
    * ``SLIDINGWINDOW:4:20`` scan the read with a 4-base wide sliding
      window, cutting when the average quality per base drops below 20
    * ``MINLEN:24`` drop reads shorter than 24 bases

    Files written under ``sample_folder`` (all prefixed by ``sample_name``):
    ``_call.log`` (Trimmomatic ``-trimlog``), ``.log`` (stderr of the call),
    the trimmed/orphan fastq files and, on success, a ``.success`` time stamp.

    :param java_path: java executable, used only when ``trimmomatic_jar``
        ends with ``jar``.
    :param sample_folder: existing output folder for this sample.
    :param sample_name: prefix for every generated file.
    :param files: list with one (single-end) or two (paired-end) read files.
    :param trimmomatic_jar: Trimmomatic jar file, or a wrapper command
        (e.g. the one shipped by conda) that needs no explicit java call.
    :param threads: value passed to Trimmomatic ``-threads``.
    :param trimmomatic_adapters: fasta file given to ``ILLUMINACLIP``.
    :param Debug: if true, print a debug message.
    :returns: ``'OK'`` if the system call succeeded, ``'FAIL'`` otherwise
        (also when ``files`` is empty).
    """
    ## function-scope import: only needed here to quote paths for the shell
    import shlex

    def quote(path):
        ## returns the text unchanged when it contains no shell-special characters
        return shlex.quote(str(path))

    ## debug message
    if Debug:
        print(colored("+ Cutting adapters for sample: " + sample_name, 'yellow'))

    ## no reads provided: report failure instead of raising IndexError on files[0]
    if not files:
        print("*** ERROR: No read files provided for sample: ", sample_name)
        return 'FAIL'

    ## log files
    prefix = sample_folder + '/' + sample_name
    log_file = prefix + '_call.log'
    trimmo_log = prefix + '.log'

    ## conda installation includes a wrapper and no java jar call is required
    if trimmomatic_jar.endswith('jar'):
        cmd = "%s -jar %s" % (java_path, quote(trimmomatic_jar))
    else:
        ## wrapper is left untouched: it may be a command rather than a path
        cmd = "%s" % (trimmomatic_jar)

    ## Paired or single end: input file(s) followed by output file(s)
    if len(files) == 2:
        mode = "PE"
        io_files = [
            files[0],
            files[1],
            prefix + '_trim_R1.fastq',
            prefix + '_orphan_R1.fastq',
            prefix + '_trim_R2.fastq',
            prefix + '_orphan_R2.fastq',
        ]
    else:
        mode = "SE"
        io_files = [files[0], prefix + '_trim.fastq']

    ## set command; every path is quoted because the string is run by a shell
    ## (the stderr redirection '2>' requires one)
    cmd = cmd + " %s -threads %s -trimlog %s %s ILLUMINACLIP:%s:2:30:10 LEADING:11 TRAILING:11 SLIDINGWINDOW:4:20 MINLEN:24 2> %s" % (
        mode,
        threads,
        quote(log_file),
        " ".join(quote(read_file) for read_file in io_files),
        quote(trimmomatic_adapters),
        quote(trimmo_log),
    )

    ## system call & return
    code = HCGB_sys.system_call(cmd)
    if code == 'OK':
        ## success stamp
        HCGB_time.print_time_stamp(sample_folder + '/.success')
        return 'OK'

    return 'FAIL'
def _files_tagged(cluster, tag):
    """Return the 'sample' entries of the rows of ``cluster`` whose 'tag' equals ``tag``."""
    return cluster.loc[cluster['tag'] == tag]['sample'].to_list()


def _stage_file(source, dest_dir):
    """Copy ``source`` into ``dest_dir`` unless already there; return the destination path."""
    target = dest_dir + '/' + os.path.basename(source)
    if not os.path.exists(target):
        shutil.copy(source, dest_dir)
    return target


def _stage_files(sources, dest_dir):
    """Stage every file of ``sources`` into ``dest_dir``; return the destination paths in order."""
    staged = []
    for source in sources:
        staged.append(_stage_file(source, dest_dir))
    return staged


def update_sample(name, cluster, own_data, user_data_db, Debug):
    """Copy the files of one sample into its own folder and dump a summary.

    A sub-folder named ``name`` is requested under ``own_data`` and, inside
    it, one sub-folder per kind of data: ``assembly``, ``annot``, ``trimm``,
    ``ident``, ``profile`` and ``mash``. Files listed in ``cluster`` are
    copied to the matching sub-folder when not already present. Finally
    ``info.txt`` (one-row summary), ``info_files.txt`` (dump of ``cluster``)
    and a ``.success`` time stamp are written in the sample folder.

    :param name: sample name; the special name ``'report'`` is skipped.
    :param cluster: pandas DataFrame with (at least) the columns ``tag``
        (one of assembly/annot/reads/ident/profile/mash) and ``sample``
        (path of the file).
    :param own_data: parent folder where the sample folder is created.
    :param user_data_db: object whose ``index`` is checked for ``name``;
        only used to print an informative message.
    :param Debug: if true, print debug messages.
    :returns: an empty tuple.
    """
    ## debug message
    if Debug:
        print(
            colored("**DEBUG: sample_frame groupby: name & cluster **",
                    'yellow'))
        print(name)
        print(cluster)

    if name == 'report':
        return ()

    print('\t+ Sending command for sample: ', name)

    ############################################
    #### check information for this sample
    ############################################

    ## generate sample
    dir_sample = HCGB_files.create_subfolder(name, own_data)

    if name in user_data_db.index:
        print(
            colored(
                "\t\t+ Data available in database for sample: %s. Checking integrity..."
                % name, 'yellow'))

    ## data to generate
    data2dump = pd.DataFrame(columns=('ID', 'folder', 'genus', 'species',
                                      'name', 'genome', 'GFF', 'proteins',
                                      'signature', 'profile', 'ident',
                                      'reads'))

    ## iterate over files with different tags: reads, annot, assembly, profile, ident

    ##########
    ## assembly: only the first file is used
    ##########
    assembly_dir = HCGB_files.create_subfolder('assembly', dir_sample)
    assembly_files = _files_tagged(cluster, 'assembly')
    genome = _stage_file(assembly_files[0], assembly_dir) if assembly_files else ""

    ##########
    ## annot
    ##########
    annot_dir = HCGB_files.create_subfolder('annot', dir_sample)
    ## both initialised up front so they are defined even when no
    ## .faa / .gff file is listed among the annotation files
    gff = ""
    prot = ""
    for annot_file in _files_tagged(cluster, 'annot'):
        target = annot_dir + '/' + os.path.basename(annot_file)
        is_tracked = True
        if annot_file.endswith('faa'):
            prot = target
        elif annot_file.endswith('gff'):
            gff = target
        else:
            is_tracked = False

        ## protein and GFF files are kept if already present;
        ## any other annotation file is always copied
        if is_tracked and os.path.exists(target):
            continue
        shutil.copy(annot_file, annot_dir)

    ##########
    ## trimm
    ##########
    trimm_dir = HCGB_files.create_subfolder('trimm', dir_sample)
    reads = _stage_files(_files_tagged(cluster, 'reads'), trimm_dir)

    ##########
    ## ident: only the first file is used
    ##########
    ident_dir = HCGB_files.create_subfolder('ident', dir_sample)
    ident_files = _files_tagged(cluster, 'ident')
    ident_file_name = _stage_file(ident_files[0], ident_dir) if ident_files else ""

    ##########
    ## profile
    ##########
    profile_dir = HCGB_files.create_subfolder('profile', dir_sample)
    profile_file = _stage_files(_files_tagged(cluster, 'profile'), profile_dir)

    ##########
    ## mash profile: only the first file is used
    ##########
    mash_dir = HCGB_files.create_subfolder('mash', dir_sample)
    mash_files = _files_tagged(cluster, 'mash')
    sig_file = _stage_file(mash_files[0], mash_dir) if mash_files else ""

    ############################################
    ### Dump information
    ## TODO: Add species and genus information when parsed from ident csv file
    ## ('genus' and 'species' are literal placeholders for now)
    #####
    data2dump.loc[len(data2dump)] = (name, dir_sample, 'genus', 'species',
                                     name, genome, gff, prot, sig_file,
                                     '::'.join(sorted(profile_file)),
                                     ident_file_name,
                                     '::'.join(sorted(reads)))

    ###### dump to file
    info_file = dir_sample + '/info.txt'
    data2dump.to_csv(info_file)

    ###### dump file information to file
    info_file2 = dir_sample + '/info_files.txt'
    cluster.to_csv(info_file2)

    ###### timestamp
    HCGB_time.print_time_stamp(dir_sample + '/.success')

    return ()