def parse_annot_file(name, folder_out_input, annot_file, output_path, Debug, ref_file=""):
    """
    Checks the format of each annotation file provided and calls the
    appropriate parser: gbf_parser or gff_parser.
    """
    ## debug messages
    if (Debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('check_annot_file function call:', 'yellow')
        debug_message('name: ' + name, 'yellow')
        debug_message('annot_file: ' + annot_file, 'yellow')

    ## check file integrity: exists & non-zero
    if (BacDup_functions.file_readable_check(annot_file)):
        ## check format; call parser
        format = format_checker.is_format(annot_file, Debug)

        ## debug messages
        if (Debug):
            debug_message('\nformat_checker.is_format function call:', 'yellow')
            debug_message('format: ' + format, 'yellow')

        ## parse gbk or gff
        if (format == 'gbk'):
            print(colored('\t* GenBank format file:........[OK]', 'green'))

            ## TODO: print details available within GenBank:
            # Accession, Bioproject,
            # Reference, Authors, Title, Journal,
            # Comment
            return (gbf_parser.gbf_parser_caller(annot_file, output_path, Debug))

        elif (format == 'gff'):
            print(colored('\t* GFF format file:.......[OK]', 'green'))
            if (HCGB_files.is_non_zero_file(ref_file)):
                return (gff_parser.gff_parser_caller(annot_file, ref_file, output_path, Debug))
            else:
                print(colored("ERROR: No genome reference file provided for this GFF annotation. Check input options provided.", "red"))
                exit()

        ## format not supported via this option
        else:
            print(colored("ERROR: input file format not supported via this option", "red"))
            exit()

    ## not accessible for this sample
    else:
        return (False)
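
## Usage sketch (hypothetical sample names and paths; not part of the original
## module). It illustrates the two supported inputs: GenBank files need no
## reference, while GFF files require the genome fasta as ref_file.
def _example_parse_annot_file():
    ## GenBank annotation: no reference genome required
    parse_annot_file("sample1", "input/sample1", "input/sample1/sample1.gbk",
                     "parse/sample1", Debug=False)
    ## GFF annotation: reference genome fasta is mandatory
    parse_annot_file("sample2", "input/sample2", "input/sample2/sample2.gff",
                     "parse/sample2", Debug=False,
                     ref_file="input/sample2/sample2.fna")
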
def main():
    ## ARGV: script name + 7 arguments expected
    if len(sys.argv) < 8:
        print("\nUsage:")
        print("python3 %s bam_file folder gtf_file threads name featureCount_bin multimapping[True/False]\n" % os.path.realpath(__file__))
        exit()

    bam_file = os.path.abspath(sys.argv[1])
    folder = os.path.abspath(sys.argv[2])
    gtf_file = os.path.abspath(sys.argv[3])
    threads = sys.argv[4]
    name = sys.argv[5]
    featureCount_exe = sys.argv[6]
    multimapping = sys.argv[7]

    ## Debug
    Debug = True

    ## call featureCounts for all biotypes
    biotype_all(featureCount_exe, folder, gtf_file, bam_file, name, threads, Debug, multimapping)

    ## plot results
    RNAbiotypes_stats_file = os.path.join(folder, name + '_RNAbiotype.tsv')
    if files_functions.is_non_zero_file(RNAbiotypes_stats_file):
        pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug)
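
## Example invocation (hypothetical file names and featureCounts path; the
## argument order matches the usage string printed above):
##
##   python3 RNAbiotype.py sample1.bam results/sample1 annotation.gtf \
##       4 sample1 /usr/local/bin/featureCounts False
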
def R_package_path_installed():
    """Provides the absolute path stored in ``R_package.info.txt``, which
    points to the location where missing R packages were installed."""

    ## check if it exists or prepare a folder to install into
    RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'R', 'R_package.info.txt')

    if HCGB_files.is_non_zero_file(RDir_package):
        install_path_list = HCGB_main.readList_fromFile(RDir_package)
        return (install_path_list[0])
    else:
        path2install = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                    'R', 'install_packages')
        HCGB_files.create_folder(path2install)
        return (path2install)
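
## Usage sketch (not part of the original module; exporting R_LIBS_SITE is an
## assumption about how a caller might make the path visible to R):
def _example_R_package_path():
    r_lib_path = R_package_path_installed()
    os.environ['R_LIBS_SITE'] = r_lib_path  ## assumption: make the path visible to R
    print("R packages expected under: " + r_lib_path)
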
def RNAbiotype_module_call(samples_dict, output_dict, gtf_file, Debug, max_workers_int, threads_job):
    """
    Create RNAbiotype analysis for each sample and create summary plots

    :param samples_dict: Dictionary containing sample IDs as keys and bam files as values
    :param output_dict: Dictionary containing sample IDs as keys and output folders as values
    :param gtf_file: Gene annotation file for the reference genome used.
    :param Debug: True/False for debugging messages
    :param max_workers_int: Number of samples to process simultaneously.
    :param threads_job: Number of threads to use per sample.
    """
    ## get featureCounts binary
    featureCount_exe = set_config.get_exe('featureCounts')

    ## send a biotype_all job for each sample
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(biotype_all, featureCount_exe, output_dict[sample],
                            gtf_file, bam_files, sample, threads_job, Debug): sample
            for sample, bam_files in samples_dict.items()
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## plot results
    for name, folder in output_dict.items():
        RNAbiotypes_stats_file = os.path.join(folder, name + '_RNAbiotype.tsv')
        if files_functions.is_non_zero_file(RNAbiotypes_stats_file):
            pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug)

    return ()
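
## Usage sketch (hypothetical sample IDs, BAM and GTF paths; not part of the
## original module). Both dictionaries must share the same sample keys.
def _example_RNAbiotype_module_call():
    samples_dict = {'sample1': 'map/sample1/Aligned.sortedByCoord.out.bam'}
    output_dict = {'sample1': 'biotype/sample1'}
    RNAbiotype_module_call(samples_dict, output_dict, 'annotation.gtf',
                           Debug=False, max_workers_int=2, threads_job=4)
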
def parse_featureCount(out_file, path, name, bam_file, Debug):
    """
    Parses featureCounts results for the RNAbiotype analysis.

    :param out_file: Name provided to featureCounts for output results.
    :param path: Absolute path to the output folder for this sample.
    :param name: Sample name.
    :param bam_file: BAM file used in the featureCounts call.
    :param Debug: True/False for debugging messages
    """

    ## file names
    out_tsv_file_name = out_file + '.tsv'
    RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

    ## check timestamp of a previous run
    filename_stamp_parse = path + '/.success_parse'
    if os.path.isfile(filename_stamp_parse):
        stamp = time_functions.read_time_stamp(filename_stamp_parse)
        print(colored("\tA previous command generated results on: %s [%s -- %s]" % (stamp, name, 'parse results'), 'yellow'))
    else:
        ## debugging messages
        if Debug:
            print("** DEBUG:")
            print("Parse results for sample: " + name)

        ## parse results
        out_tsv_file = open(out_tsv_file_name, 'w')
        RNA_biotypes_file = open(RNA_biotypes_file_name, 'w')
        tRNA_count = 0

        ##########################################
        ### read count file
        ##########################################
        count_file = open(out_file)
        count_file_text = count_file.read()
        count_file_lines = count_file_text.splitlines()

        for line in count_file_lines:
            if line.startswith('#'):
                continue
            elif line.startswith('Geneid'):
                continue
            else:
                ID = line.split('\t')[0]
                count = int(line.split('\t')[-1])
                string2write_raw = "%s\t%s\n" % (ID, count)
                out_tsv_file.write(string2write_raw)

                tRNA_search = re.search(r".*tRNA", ID)
                if tRNA_search:
                    tRNA_count = int(tRNA_count) + int(count)
                elif (count > 0):
                    RNA_biotypes_file.write(string2write_raw)

        ## count and summarize tRNA
        string2write = "tRNA\t%s\n" % tRNA_count
        RNA_biotypes_file.write(string2write)
        RNA_biotypes_file.close()

        ##########################################
        ### read summary count file
        ##########################################
        summary_count_file = open(out_file + '.summary')
        summary_count_file_text = summary_count_file.read()
        summary_count_file_lines = summary_count_file_text.splitlines()

        for line in summary_count_file_lines:
            if line.startswith('Status'):
                continue
            elif line.startswith('Assigned'):
                continue
            else:
                ## adds Unassigned_Ambiguity
                ## adds Unassigned_NoFeatures
                ID = line.split('\t')[0]
                count = int(line.split('\t')[-1])

                ## skip empty entries
                if count == 0:
                    continue
                string2write_raw = "%s\t%s\n" % (ID, count)
                out_tsv_file.write(string2write_raw)

        ##########################################
        ## get mapping statistics according to mapping software
        ##########################################
        count_multi = 0
        count_unmap = 0
        mapping_folder = os.path.dirname(bam_file)
        mapping_stats = mapping_folder + '/Log.final.out'

        ## -------------------------------- ##
        ### STAR mapping
        ## -------------------------------- ##
        if files_functions.is_non_zero_file(mapping_stats):
            ## debugging messages
            if Debug:
                print("** DEBUG:")
                print("STAR mapping available for sample: " + name)
                print("mapping_folder: " + mapping_folder)

            mapping_stats_file = open(mapping_stats)
            mapping_stats_file_text = mapping_stats_file.read()
            mapping_stats_file_lines = mapping_stats_file_text.splitlines()

            for line in mapping_stats_file_lines:
                multi_search = re.search(r".*Number of reads mapped to", line)
                unmap_search = re.search(r".*unmapped.*", line)
                input_search = re.search(r".*input reads.*", line)

                if input_search:
                    total_input_reads = int(line.split('\t')[-1])

                if multi_search:
                    count_tmp = int(line.split('\t')[-1])
                    count_multi = count_multi + count_tmp
                elif unmap_search:
                    perc_tmp = line.split('\t')[-1]
                    count_reads = math_functions.percentage(perc_tmp, total_input_reads)
                    count_unmap = count_unmap + count_reads
        else:
            ## -------------------------------- ##
            ### tophat mapping
            ## -------------------------------- ##
            mapping_stats = mapping_folder + '/align_summary.txt'
            count_map = 0
            total_input_reads = 0

            if files_functions.is_non_zero_file(mapping_stats):
                ## debugging messages
                if Debug:
                    print("** DEBUG:")
                    print("tophat mapping available for sample: " + name)
                    print("mapping_folder: " + mapping_folder)

                mapping_stats_file = open(mapping_stats)
                mapping_stats_file_text = mapping_stats_file.read()
                mapping_stats_file_lines = mapping_stats_file_text.splitlines()

                for line in mapping_stats_file_lines:
                    map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
                    input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
                    if input_search2:
                        total_input_reads = input_search2.group(1)
                    if map_search2:
                        count_map = map_search2.group(1)

                ####
                count_unmap = int(total_input_reads) - int(count_map)
            else:
                ## other
                print("Neither tophat nor STAR mapping statistics available...")

        ### print mapping stats
        string2write_unmap = "unmapped\t%s\n" % count_unmap
        out_tsv_file.write(string2write_unmap)

        ## close files
        out_tsv_file.close()

        ## print timestamp
        time_functions.print_time_stamp(filename_stamp_parse)

    return (out_tsv_file_name, RNA_biotypes_file_name)
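
## Usage sketch (hypothetical paths; not part of the original module). The
## out_file argument is the raw featureCounts output; the '.tsv' and
## '_RNAbiotype.tsv' files are derived from it as shown above.
def _example_parse_featureCount():
    out_tsv, biotypes_tsv = parse_featureCount(
        out_file='biotype/sample1/featureCount.out',
        path='biotype/sample1', name='sample1',
        bam_file='map/sample1/Aligned.sortedByCoord.out.bam', Debug=False)
    print(out_tsv, biotypes_tsv)
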
def results_parser(database, folderResults, sampleName, outfolder,
                   assembly_cutoff, card_trick_info):
    """Parse ARIBA results

    This function extracts files and generates additional information for
    later parsing according to the type of database provided.

    .. seealso:: Additional information on ARIBA results generated.

        - :ref:`ARIBA-explained`
    """
    if not os.path.exists(folderResults):
        print("+ Finish parsing information for sample [%s]. Results folder does not exist." % sampleName)
        return ('NaN', 'NaN')

    ## get files
    list_files = os.listdir(folderResults)

    ## init
    assemblies = ""
    assembled_genes = ""
    fileResults = ""

    print("\n+ Parsing result file for sample: ", sampleName)

    ## extract files
    print("\n+ Extracting files if necessary:")
    for f in list_files:
        filePath = os.path.join(folderResults, f)
        if f.endswith('.gz'):
            HCGB_files.extract(filePath, folderResults)
        if (f == 'report.tsv'):
            fileResults = filePath
        elif (f == 'assemblies.fa.gz'):
            assemblies = os.path.join(folderResults, 'assemblies.fa')
        elif (f == 'assembled_genes.fa.gz'):
            assembled_genes = os.path.join(folderResults, 'assembled_genes.fa')
    print("\n")

    ## no results generated
    if not HCGB_files.is_non_zero_file(fileResults):
        print('+ No results generated for sample: ', sampleName)
        return ('', '')

    ### expand flags
    flagResults = folderResults + '/flags_explain.tsv'
    fileFlags = ariba_caller.ariba_expandflag(fileResults, flagResults)

    ######################
    ## generate summary
    ######################
    ##
    ## ariba has a function that generates a summary for samples
    ##
    summary_results_tmp = folderResults + '/report_summary_tmp'
    summary_results = folderResults + '/report_summary.csv'
    options = "--no_tree"
    ## Info
    ## https://github.com/sanger-pathogens/ariba/wiki/The-assembled-column-from-ariba-summary

    ariba_caller.ariba_summary(summary_results_tmp, [fileResults], options)

    ## fix names: just for aesthetics
    fake_dict = {sampleName: fileResults}
    ariba_caller.fix_ariba_summary(summary_results_tmp + '.csv', summary_results, fake_dict)
    os.remove(summary_results_tmp + '.csv')

    ############################################
    ### check results according to database
    ############################################
    if (database == 'vfdb_full'):
        (name_excel, name_csv) = parse_vfdb(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results, assembly_cutoff)
    elif (database == 'card'):
        (name_excel, name_csv) = parse_card(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results,
                                            assembly_cutoff, card_trick_info)
    else:
        ## [TODO] check results according to databases other than CARD/VFDB
        (name_excel, name_csv) = parse_results(outfolder, sampleName,
                                               fileResults, fileFlags, summary_results)

    print('\tCheck additional information on ', name_excel)

    ## print success timestamp
    filename_stamp = outfolder + '/.success_' + database
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (name_excel, name_csv)
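
## Usage sketch (hypothetical database name, paths and cutoff; not part of the
## original module). card_trick_info is only used for the 'card' database and
## can be left empty otherwise.
def _example_results_parser():
    name_excel, name_csv = results_parser('card', 'ariba/sample1_card',
                                          'sample1', 'profile/sample1',
                                          assembly_cutoff=0.9,
                                          card_trick_info='card_trick_info_folder')
    print(name_excel, name_csv)
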
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""

    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call: use mummer (-m) and force results folder (-f)
    cmd_call = "%s -i %s -m -f > %s 2> %s " % (agrvate_bin, assembly_file, log_call, err_call)
    status = HCGB_sys.system_call(cmd_call)

    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()

    ## check results folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")

        ## rename folder and error report
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
                  os.path.join(results_folder, 'error_report.tab'))

        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter')  ## open excel handle

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample

        ## columns
        # agr_group: gp1/gp2/gp3/gp4. 'u' means unknown.
        #            If multiple agr groups were found (col 5 = m), the displayed
        #            agr group is the majority/highest confidence.
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown.
        #               Multiple groups are likely due to multiple S. aureus
        #               isolates in the sequence.
        # frameshifts: number found in the CDS of the extracted agr operon
        #              ('u' if agr operon not extracted)

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel: tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary')  ## write excel handle

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample

            ## columns
            # contig: assembly contig ID
            # agr: ID of the matched agr group kmer
            # evalue: evalue of the match
            # identity: percentage identity of the match
            # start: start position of the kmer alignment on the input sequence
            # end: end position of the kmer alignment on the input sequence

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)

            ## save agr_gp_tab file into excel: tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon')  ## write excel handle

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)

            results['operon_fna'] = agr_operon_fna_file
        except IndexError:
            results['operon_fna'] = ''

        ## error report
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report = HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)

        ## save error_report file into excel: tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps')  ## write excel handle

        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save()  ## close excel handle

        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
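
## Usage sketch (hypothetical sample and paths; not part of the original
## module). AgrVATE expects an assembly fasta; results land in
## <folder>/agrvate_results plus an xlsx summary next to it.
def _example_agrvate_call():
    results_df = agrvate_call('sample1', 'assembly/sample1.fna',
                              'agr/sample1', debug=False)
    print(results_df)
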
def get_reference_gbk(options):

    ####################
    ## Genbank_ID
    ####################
    reference_gbk_file = ""
    if options.Genbank_ID:
        db_frame_ncbi = database_generator.getdbs('NCBI', options.database, 'genbank', options.debug)

        ## debug message
        if (options.debug):
            print(colored("**DEBUG: db_frame_ncbi **", 'yellow'))
            print(db_frame_ncbi)

        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.database)
        dir_path = os.path.join(NCBI_folder, 'genbank', 'bacteria', options.Genbank_ID)
        if (options.Genbank_ID in db_frame_ncbi.index):
            print('\t+ Reference (%s) available in database provided' % options.Genbank_ID)
        else:
            print('\t+ Reference (%s) is not available in database provided' % options.Genbank_ID)
            print('\t+ Try to download it.')
            database_generator.ngd_download(dir_path, options.Genbank_ID, NCBI_folder)

        ## get files downloaded
        (genome, prot, gff, gbk) = database_generator.get_files_download(dir_path)
        if options.debug:
            print(colored("**DEBUG: genome:" + genome, 'yellow'))
            print(colored("**DEBUG: prot:" + prot, 'yellow'))
            print(colored("**DEBUG: gff:" + gff, 'yellow'))
            print(colored("**DEBUG: gbk:" + gbk, 'yellow'))

        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n+ No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## user_sample_ID
    ####################
    elif options.user_sample_ID:
        db_frame_user_Data = database_user.get_userData_files(options, os.path.join(options.database, 'user_data'))
        df_data = db_frame_user_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.user_sample_ID)
            print('\t+ Reference (%s) available in database folder provided' % options.user_sample_ID)
        except KeyError:
            print(colored('** WARNING: Reference (%s) not available in database folder provided' % options.user_sample_ID, 'yellow'))
            print("\t+ Let's try to update the database first.")
            db_frame_user_dataUpdated = database_user.update_database_user_data(options.database, options.input, options.debug, options)
            df_data = db_frame_user_dataUpdated.groupby('name')

            try:
                this_sample_df = df_data.get_group(options.user_sample_ID)
                print('\t+ Reference (%s) available in database updated' % options.user_sample_ID)
                db_frame_user_Data = db_frame_user_dataUpdated
            except KeyError:
                print(colored('\n** ERROR: No reference (%s) available in database updated. Some error occurred...' % options.user_sample_ID, 'red'))
                exit()

        ## debug message
        if (options.debug):
            print(colored("**DEBUG: db_frame_user_Data **", 'yellow'))
            print(db_frame_user_Data)
            print(colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print(this_sample_df)

        ## get gbk file
        gbk = this_sample_df.loc[this_sample_df['ext'] == 'gbf', 'sample'].values[0]

        ## debug
        if options.debug:
            print("** DEBUG: this_sample_df")
            print(this_sample_df)
            print('gbk:' + gbk)

        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## project_sample_ID
    ####################
    elif options.project_sample_ID:
        db_frame_project_Data = database_user.get_userData_files(options, options.input)
        df_data = db_frame_project_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.project_sample_ID)
            print('\t+ Reference (%s) available in project folder provided' % options.project_sample_ID)
        except KeyError:
            print(colored('** ERROR: Reference (%s) not available in project folder provided' % options.project_sample_ID, 'red'))
            print('\t+ Check the spelling or provide a valid ID.')
            exit()

        ## debug message
        if (options.debug):
            print(colored("**DEBUG: db_frame_project_Data **", 'yellow'))
            print(db_frame_project_Data)
            print(colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print(this_sample_df)

        ## get gbk file
        gbk = this_sample_df.loc[this_sample_df['ext'] == 'gbf', 'sample'].values[0]

        ## debug
        if options.debug:
            print("** DEBUG: this_sample_df")
            print(this_sample_df)
            print('gbk:' + gbk)

        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## user_ref
    ####################
    elif options.user_ref:
        options.user_ref = os.path.abspath(options.user_ref)
        if HCGB_files.is_non_zero_file(options.user_ref):
            print('\t+ Reference provided via --user_ref is available and ready to use.')
        else:
            print('\n** ERROR: Reference provided via --user_ref not available or accessible.')
            print(colored('\n+ Check the path or integrity of the file. Some error occurred...', 'red'))
            exit()
        reference_gbk_file = options.user_ref

    return (reference_gbk_file)
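
## Usage sketch (hypothetical argparse-like options; not part of the original
## module). Only one of the four reference sources is used, checked in order:
## Genbank_ID, user_sample_ID, project_sample_ID, user_ref.
def _example_get_reference_gbk():
    import argparse
    opts = argparse.Namespace(Genbank_ID="", user_sample_ID="",
                              project_sample_ID="", user_ref="reference.gbk",
                              database="db_folder", input="project_folder",
                              debug=False)
    gbk_file = get_reference_gbk(opts)
    print(gbk_file)
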
def parse_information(arg_dict, df_accID, outdir):

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():
        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample], 'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'], 'yellow')
            debug_message('genome: ' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        if (not HCGB_files.is_non_zero_file(parse_timestamp) and not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):
                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(folder_input, 'info.csv'), index=True, header=True)

                ## print time stamps
                HCGB_time.print_time_stamp(input_timestamp)
                HCGB_time.print_time_stamp(parse_timestamp)
            else:
                print(colored("\t+ Some error occurred for sample %s while parsing input options" % sample, 'red'))

                ## print fail time stamps
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
                HCGB_time.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(colored("\t+ Input parsing already available for sample %s [%s]" % (sample, read_time), 'green'))
            print()
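
## Usage sketch (hypothetical df_accID layout; not part of the original
## module). df_accID is expected to be indexed by sample name and to carry at
## least the 'annot_file' and 'genome' columns read above.
def _example_parse_information():
    import argparse
    import pandas as pd
    arg_dict = argparse.Namespace(project=True, debug=False)
    df_accID = pd.DataFrame({'annot_file': ['input/sample1/sample1.gbk'],
                             'genome': ['input/sample1/sample1.fna']},
                            index=['sample1'])
    parse_information(arg_dict, df_accID, outdir='analysis_folder')
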
def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNA biotype analysis
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                            ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim",
                                                            ['_trim'], options.debug)

    ## Discard if joined reads: use trimmed single-end or paired-end
    pd_samples_retrieved = pd_samples_retrieved[pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(outdir, options.project,
                                                         pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map reads
    ##############################################
    (start_time_partial, mapping_results) = mapReads_module(options, pd_samples_retrieved,
                                                            mapping_outdir_dict, options.debug,
                                                            max_workers_int, threads_job,
                                                            start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(outdir, options.project,
                                                         pd_samples_retrieved, "biotype", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module for featureCount analysis.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print("+ Detailed information for each sample can be found in separate folders:")

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder("featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount", featureCount_report, "-dd 2")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder("biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder("samples", biotype_report)

        ## results
        dict_files = {}
        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples], 'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(biotype_outdir_dict[samples], '.pdf', options.debug)
            if files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum', options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script, abs_csv_outfile, outfile_pdf)

        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting biotype module.")
    return ()
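
## Usage sketch (hypothetical values; not part of the original module): the
## minimal set of attributes run_biotype() itself reads from its options
## object. mapReads_module may require additional attributes not listed here.
def _example_run_biotype():
    import argparse
    opts = argparse.Namespace(
        help_format=False, help_project=False, help_RNAbiotype=False,
        debug=False, single_end=False, detached=False,
        input='analysis_folder', output_folder='',
        noTrim=False, annotation='annotation.gtf',
        threads=8, skip_report=False)
    run_biotype(opts)
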
def run_prep(options):
    """
    Main function of the prep module.

    This module prepares fastq files for later usage. It initially checks the
    length of the sample names and advises the user to rename samples if the
    limit is exceeded. Along ``BacterialTyper`` there are a few string length
    limitations imposed by different software that need to be sorted out from
    the beginning of the process.

    This module allows the user to either copy files into the project folder
    or just create symbolic links to avoid duplicating raw data.

    See additional details of this module in the user guide:
    :ref:`prep module entry<prep-description>`.

    .. seealso:: This function depends on other HCGB functions:

        - :func:`HCGB.sampleParser`
        - :func:`HCGB.functions.aesthetics_functions`
        - :func:`HCGB.functions.time_functions`
        - :func:`HCGB.functions.main_functions`
        - :func:`HCGB.functions.file_functions`
    """

    ## help_format option
    if (options.help_format):
        help_info.help_fastq_format()
        exit()

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Preparing samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(options.output_folder)

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
    else:
        options.project = True

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ### info
    final_dir = ""
    if (options.project):
        print("+ Generate a directory containing information within the project folder provided")
        final_dir = HCGB_files.create_subfolder("info", outdir)
    else:
        final_dir = outdir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq",
                                                        ("fastq", "fq", "fastq.gz", "fq.gz"),
                                                        options.debug)

    ## Information returned in pd_samples_retrieved
    ### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz

    if options.debug:
        HCGB_aes.debug_message("pd_samples_retrieved", "yellow")
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## check character limitation
    list_lengths = pd_samples_retrieved.loc[:, 'name_len'].to_list()
    if any(i > 10 for i in list_lengths):
        print(colored("\t ** Name lengths exceed the 10 character limitation...", 'yellow'))
        if not (options.rename):
            print(colored("** ERROR: Rename files or provide --rename option...", 'red'))
            exit()

    ### rename files
    if (options.rename):
        options.rename = os.path.abspath(options.rename)
        if not HCGB_files.is_non_zero_file(options.rename):
            print(colored("** ERROR: File provided with rename information is not readable.", 'red'))
            print(options.rename)
            exit()

        names_retrieved = pd.read_csv(options.rename, sep=',', index_col=0,
                                      squeeze=True, header=None).to_dict()  ## read csv to dictionary

        if (options.debug):
            HCGB_aes.debug_message("names_retrieved", "yellow")
            print(names_retrieved)

        ## TODO: check integrity of new names and special characters

        ## print to a file
        timestamp = HCGB_time.create_human_timestamp()
        rename_details = final_dir + '/' + timestamp + '_prep_renameDetails.txt'
        rename_details_hd = open(rename_details, 'w')

        ## rename files
        for index, row in pd_samples_retrieved.iterrows():
            if (row['gz']):
                extension_string = row['ext'] + row['gz']
            else:
                extension_string = row['ext']

            if options.single_end:
                renamed = names_retrieved[row['name']] + '.' + extension_string
            else:
                renamed = names_retrieved[row['name']] + '_' + row['read_pair'] + '.' + extension_string

            ## modify frame
            pd_samples_retrieved.loc[index, 'new_file'] = renamed
            pd_samples_retrieved.loc[index, 'new_name'] = names_retrieved[row['name']]

            ## save in file
            string = row['sample'] + '\t' + renamed + '\n'
            rename_details_hd.write(string)

            if (options.debug):
                print(colored('** DEBUG: rename', 'yellow'))
                print("Original: ", row['name'])
                print("Renamed: ", names_retrieved[row['name']])
                print("File:", renamed)

        rename_details_hd.close()
        ## the rename block above works for both single-end and paired-end reads
        print("+ Sample files have been renamed...")
    else:
        pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

    ## create outdir for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved, "raw", options.debug)

    ## merge option
    if (options.merge):
        print("+ Sample files will be merged...")
        ## TODO: check when rename option provided
        pd_samples_merged = sampleParser.merge.one_file_per_sample(
            pd_samples_retrieved, outdir_dict, options.threads, final_dir, options.debug)

        if (options.rename):
            print("+ Merged files have been renamed...")
        else:
            print("+ Sample files have been merged...")

        ## process is finished here
        print("\n*************** Finish *******************")
        start_time_partial = HCGB_time.timestamp(start_time_total)
        print("+ Exiting prep module.")
        exit()

    ## debugging messages
    if (options.debug):
        print(colored("** DEBUG: pd_samples_retrieved", 'yellow'))
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)
        print(colored("** DEBUG: outdir_dict", 'yellow'))
        print(outdir_dict)

    ## copy or create symbolic link for files
    if (options.copy):
        print("+ Sample files will be copied...")
        ## print to a file
        timestamp = HCGB_time.create_human_timestamp()
        copy_details = final_dir + '/' + timestamp + '_prep_copyDetails.txt'
        copy_details_hd = open(copy_details, 'w')
    else:
        print("+ Sample files will be linked...")

    list_reads = []
    for index, row in pd_samples_retrieved.iterrows():
        if (options.copy):
            ## TODO: debug & set threads to copy faster
            shutil.copy(row['sample'], os.path.join(outdir_dict[row['new_name']], row['new_file']))
            string = row['sample'] + '\t' + os.path.join(outdir_dict[row['new_name']], row['new_file']) + '\n'
            copy_details_hd.write(string)
        else:
            list_reads.append(row['new_file'])
            if options.project:
                HCGB_files.get_symbolic_link_file(row['sample'],
                                                  os.path.join(outdir_dict[row['new_name']], row['new_file']))

    if (options.copy):
        print("+ Sample files have been copied...")
        copy_details_hd.close()
    else:
        if not options.project:
            HCGB_files.get_symbolic_link(list_reads, outdir)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("+ Exiting prep module.")
    return ()
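
## Rename file format sketch (hypothetical sample names): --rename expects a
## header-less two-column csv mapping original sample names to new ones, read
## with index_col=0 above, e.g.:
##
##   original_long_sample_name_1,s1
##   original_long_sample_name_2,s2
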