Example #1
def parse_annot_file(name,
                     folder_out_input,
                     annot_file,
                     output_path,
                     Debug,
                     ref_file=""):
    """
    This function checks the type of input for each annotation file provided
    and calls the appropriate parser: gbf_parser or gff_parser.
    """
    ## debug messages
    if (Debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('parse_annot_file function call:', 'yellow')
        debug_message('name: ' + name, 'yellow')
        debug_message('annot_file: ' + annot_file, 'yellow')

    ## check file integrity: exists & non-zero
    if (BacDup_functions.file_readable_check(annot_file)):
        ## check format; call parser
        format = format_checker.is_format(annot_file, Debug)

        ## debug messages
        if (Debug):
            debug_message('\nformat_checker.is_format function call:',
                          'yellow')
            debug_message('format: ' + format, 'yellow')

        ## parse gbk or gff
        if (format == 'gbk'):
            print(colored('\t* GenBank format file:........[OK]', 'green'))

            ## TODO: print details available within GenBank:
            # Accession, Bioproject,
            # Reference, Authors, Title, Journal,
            # Comment

            return (gbf_parser.gbf_parser_caller(annot_file, output_path,
                                                 Debug))

        elif (format == 'gff'):
            print(colored('\t* GFF format file:.......[OK]', 'green'))
            if (HCGB_files.is_non_zero_file(ref_file)):
                return (gff_parser.gff_parser_caller(annot_file, ref_file,
                                                     output_path, Debug))
            else:
                print(
                    colored(
                        "ERROR: No genome reference file provided for this GFF annotation. Check input options provided.",
                        "red"))
                exit()

        ## not valid via this option
        else:
            print(colored("ERROR: not valid via this option", "red"))
            exit()

    ## not accessible for this sample
    else:
        return (False)
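
A minimal usage sketch, assuming the BacDup modules imported by this snippet are available; all paths and the sample name are hypothetical placeholders:

## usage sketch (hypothetical paths)
process_OK = parse_annot_file(name='sample1',
                              folder_out_input='input/sample1',
                              annot_file='input/sample1/annot.gff',
                              output_path='parse/sample1',
                              Debug=True,
                              ref_file='input/sample1/genome.fna')
if not process_OK:
    print('annotation file not accessible for this sample')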
Example #2
def main():
	
	## ARGV
	if len(sys.argv) < 8: ## script name plus the 7 arguments used below
		print ("\nUsage:")
		print ("python3 %s bam_file folder gtf_file threads name featureCount_bin multimapping[True/False]\n" %os.path.realpath(__file__))
		exit()
	
	bam_file = os.path.abspath(sys.argv[1])
	folder = os.path.abspath(sys.argv[2])
	gtf_file = os.path.abspath(sys.argv[3])
	threads = sys.argv[4]
	name = sys.argv[5]
	featureCount_exe = sys.argv[6]
	multimapping = sys.argv[7]

	## Debug
	Debug=True
	
	## variables
	biotype_all(featureCount_exe, folder, gtf_file, bam_file, name, threads, Debug, multimapping)
	## plot results
	RNAbiotypes_stats_file = os.path.join(folder, name + '_RNAbiotype.tsv')
	if files_functions.is_non_zero_file(RNAbiotypes_stats_file):
		pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug)
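
For completeness, a hedged sketch of the standard entry-point guard and an example invocation; the script name and all argument values are placeholders:

## entry point (placeholder invocation shown below)
if __name__ == '__main__':
	main()

## example call:
## python3 biotype_featureCount.py sample.bam out_folder annot.gtf 4 sample1 /path/to/featureCounts False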
Example #3
def R_package_path_installed():
    """Provides absolute path to file ``R_package.info.txt`` containing path to missing R packages installed"""

    ## check if exists or try to install
    RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'R', 'R_package.info.txt')

    if HCGB_files.is_non_zero_file(RDir_package):
        install_path_list = HCGB_main.readList_fromFile(RDir_package)
        return (install_path_list[0])
    else:
        path2install = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'R',
            'install_packages')
        HCGB_files.create_folder(path2install)
        return (path2install)
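
A short usage sketch; the printed path is either the recorded install location read from ``R_package.info.txt`` or the freshly created ``install_packages`` folder:

## usage sketch
R_lib_path = R_package_path_installed()
print("Missing R packages are installed in: " + R_lib_path)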
Example #4
def RNAbiotype_module_call(samples_dict, output_dict, gtf_file, Debug,
                           max_workers_int, threads_job):
    """
	Create RNAbiotype analysis for each sample and create summary plots
	
	:param samples_dict: Dictionary containing sample IDs as keys and bam files as values
	:param output_dict: Dictionary containing sample IDs as keys and output folder as values
	:param gtf_file: Gene annotation file for the reference genome used.
	:param threads: Number of threads to use.
	:param Debug: True/False for debugging messages
	"""

    ## get bin
    featureCount_exe = set_config.get_exe('featureCounts')

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(biotype_all, featureCount_exe, output_dict[sample],
                            gtf_file, bam_files, sample, threads_job, Debug):
            sample
            for sample, bam_files in samples_dict.items()
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ##
    ## plot results
    for name, folder in output_dict.items():
        RNAbiotypes_stats_file = os.path.join(folder, name + '_RNAbiotype.tsv')
        if files_functions.is_non_zero_file(RNAbiotypes_stats_file):
            pie_plot_results(RNAbiotypes_stats_file, name, folder, Debug)

    return ()
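
A hedged call sketch; sample IDs, bam paths, output folders and the gtf file are placeholders, and the thread split assumes two concurrent jobs with four threads each:

## usage sketch (placeholder samples, paths and thread split)
samples_dict = {'sample1': 'map/sample1/sample1.bam',
                'sample2': 'map/sample2/sample2.bam'}
output_dict = {'sample1': 'biotype/sample1',
               'sample2': 'biotype/sample2'}
RNAbiotype_module_call(samples_dict, output_dict, 'annotation.gtf',
                       Debug=False, max_workers_int=2, threads_job=4)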
Example #5
def parse_featureCount(out_file, path, name, bam_file, Debug):
	"""
	Parses featureCount results for RNAbiotype analysis.
	
	:param out_file: Name provided to featureCount for output results.
	:param path:
	:param name:
	
	
	"""

	## file names
	out_tsv_file_name = out_file + '.tsv'
	RNA_biotypes_file_name = os.path.join(path, name + '_RNAbiotype.tsv')

	##
	filename_stamp_parse = path + '/.success_parse'
	if os.path.isfile(filename_stamp_parse):
		stamp = time_functions.read_time_stamp(filename_stamp_parse)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'parse results'), 'yellow'))
	else:
	
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("Parse results for sample: " + name)
			
		## parse results
		out_tsv_file = open(out_tsv_file_name, 'w')
		RNA_biotypes_file = open(RNA_biotypes_file_name, 'w')
		tRNA_count = 0
		
		##########################################
		### read count file
		##########################################
		count_file = open(out_file)
		count_file_lines = count_file.read().splitlines()
		count_file.close()
	
		for line in count_file_lines:
			if line.startswith('#'):
				continue
			elif line.startswith('Geneid'):
				continue
			else:
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
				tRNA_search = re.search(r".*tRNA", ID)
				if tRNA_search:
					tRNA_count += count
				elif (count > 0):
					RNA_biotypes_file.write(string2write_raw)
		
		## count and summary tRNA
		string2write = "tRNA\t%s\n" %tRNA_count
		RNA_biotypes_file.write(string2write)
		RNA_biotypes_file.close()
				
		##########################################
		### read summary count file
		##########################################
		summary_count_file = open(out_file + '.summary')
		summary_count_file_lines = summary_count_file.read().splitlines()
		summary_count_file.close()
	
		for line in summary_count_file_lines:
			if line.startswith('Status'):
				continue
			elif line.startswith('Assigned'):
				continue
			else:
				## adds Unassigned_Ambiguity
				## adds Unassigned_NoFeatures
				ID = line.split('\t')[0]
				count = int(line.split('\t')[-1])
	
				## skip empty entries
				if count == 0:
					continue
				string2write_raw = "%s\t%s\n" %(ID, count)
				out_tsv_file.write(string2write_raw)
	
		##########################################
		## get mapping statistics according to mapping software
		##########################################
		count_multi = 0
		count_unmap = 0
		mapping_folder = os.path.dirname(bam_file)
		mapping_stats = mapping_folder + '/Log.final.out'
		
		## -------------------------------- ##
		### STAR mapping		
		## -------------------------------- ##
		if files_functions.is_non_zero_file(mapping_stats):
			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("STAR mapping available for sample: " + name)
				print ("mapping_folder: " + mapping_folder)
	
			mapping_stats_file = open(mapping_stats)
			mapping_stats_file_lines = mapping_stats_file.read().splitlines()
			mapping_stats_file.close()
	
			for line in mapping_stats_file_lines:
				multi_search = re.search(r".*Number of reads mapped to", line)
				unmap_search = re.search(r".*unmapped.*", line)
				input_search = re.search(r".*input reads.*", line)
			
				if input_search:
					total_input_reads = int(line.split('\t')[-1])
	
				if multi_search:
					count_tmp = int(line.split('\t')[-1])
					count_multi = count_multi + count_tmp
	
				elif unmap_search:
					perc_tmp = line.split('\t')[-1]
					count_reads = math_functions.percentage(perc_tmp, total_input_reads)
					count_unmap = count_unmap + count_reads
		else:
	
			## -------------------------------- ##
			## tophat
			## -------------------------------- ##
	
			mapping_stats = mapping_folder + '/align_summary.txt' 
			count_map = 0
			total_input_reads = 0
			
			if files_functions.is_non_zero_file(mapping_stats):
				## debugging messages
				if Debug:
					print ("** DEBUG:")
					print ("tophat mapping available for sample: " + name)
					print ("mapping_folder: " + mapping_folder)
				
				mapping_stats_file = open(mapping_stats)
				mapping_stats_file_lines = mapping_stats_file.read().splitlines()
				mapping_stats_file.close()
	
				for line in mapping_stats_file_lines:
					map_search2 = re.search(r"Aligned.*\:\s+(\d+).*", line)
					input_search2 = re.search(r".*Input.*\:\s+(\d+).*", line)
					if input_search2:
						total_input_reads = input_search2.group(1)
					if map_search2:
						count_map = map_search2.group(1)
		
				####
				count_unmap = int(total_input_reads) - int(count_map)
	
			else:
				## other
				print ("Neither tophat or STAR..., no mapping statistics")
	
		### print mapping stats
		string2write_unmap = "unmapped\t%s\n" %count_unmap
		out_tsv_file.write(string2write_unmap)
		
		## close files
		out_tsv_file.close()

		## print timestamp
		time_functions.print_time_stamp(filename_stamp_parse)

	return(out_tsv_file_name, RNA_biotypes_file_name)
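
The STAR branch above converts the percentage strings from ``Log.final.out`` back into read counts via math_functions.percentage. A hedged reconstruction of that conversion (a sketch, not the actual HCGB implementation):

## hedged sketch of the conversion assumed for math_functions.percentage:
## STAR logs unmapped reads as e.g. "3.50%"; turn it back into a read count
def _percentage_sketch(perc_string, total_reads):
	perc = float(str(perc_string).strip().rstrip('%'))
	return int(total_reads * perc / 100)

## e.g. _percentage_sketch("3.50%", 1000) -> 35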
Example #6
def results_parser(database, folderResults, sampleName, outfolder,
                   assembly_cutoff, card_trick_info):
    """Parse ARIBA results
	
	This function basically extracts files and generated additionally information for later
	parse according to type of database provided.
	
	.. seealso:: Additional information to ARIBA results generated.
	
		- :ref:`ARIBA-explained`
	"""
    if not os.path.exists(folderResults):
        print(
            "+ Finish parsing information for sample [%s]. Results folder does not exist."
            % sampleName)
        return ('NaN', 'NaN')

    ## get files
    list_files = os.listdir(folderResults)

    ## init
    assemblies = ""
    assemled_genes = ""
    fileResults = ""

    print("\n+ Parsing result file for sample: ", sampleName)

    ## extract files
    print("\n+ Extracting files if necessary:")
    for f in list_files:
        filePath = os.path.join(folderResults, f)
        if f.endswith('.gz'):
            HCGB_files.extract(filePath, folderResults)
        if (f == 'report.tsv'):
            fileResults = filePath
        elif (f == 'assemblies.fa.gz'):
            assemblies = os.path.join(folderResults, 'assemblies.fa')
        elif (f == 'assembled_genes.fa.gz'):
            assembled_genes = os.path.join(folderResults, 'assembled_genes.fa')
    print("\n")

    ## no results generated
    if not HCGB_files.is_non_zero_file(fileResults):
        print('+ No results generated for sample: ', sampleName)
        return ('', '')

    ### expand flags
    flagResults = folderResults + '/flags_explain.tsv'
    fileFlags = ariba_caller.ariba_expandflag(fileResults, flagResults)

    ######################
    ## generate summary
    ######################
    ##
    ## ariba has function that generates a summary for samples
    ##
    summary_results_tmp = folderResults + '/report_summary_tmp'
    summary_results = folderResults + '/report_summary.csv'
    options = "--no_tree"
    ## Info
    ## https://github.com/sanger-pathogens/ariba/wiki/The-assembled-column-from-ariba-summary

    ariba_caller.ariba_summary(summary_results_tmp, [fileResults], options)

    ## fix names: just for aesthetics
    fake_dict = {sampleName: fileResults}
    ariba_caller.fix_ariba_summary(summary_results_tmp + '.csv',
                                   summary_results, fake_dict)
    os.remove(summary_results_tmp + '.csv')

    ############################################
    ### check results according to database
    ############################################
    if (database == 'vfdb_full'):
        (name_excel, name_csv) = parse_vfdb(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results,
                                            assembly_cutoff)
    elif (database == 'card'):
        (name_excel, name_csv) = parse_card(outfolder, sampleName, fileResults,
                                            fileFlags, summary_results,
                                            assembly_cutoff, card_trick_info)
    else:
        ## [TODO] check results according to databases different than CARD/VFDB
        (name_excel, name_csv) = parse_results(outfolder, sampleName,
                                               fileResults, fileFlags,
                                               summary_results)

    print('\tCheck additional information on ', name_excel)

    ## print success timestamp
    filename_stamp = outfolder + '/.success_' + database
    stamp = HCGB_time.print_time_stamp(filename_stamp)

    return (name_excel, name_csv)
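
A hedged call sketch for results_parser; every path, the cutoff value and the CARD ontology folder are placeholders:

## usage sketch (placeholder paths; card_trick_info is only used for 'card')
(name_excel, name_csv) = results_parser('card', 'ariba_card/sample1',
                                        'sample1', 'profile/sample1',
                                        assembly_cutoff=0.90,
                                        card_trick_info='card_ontology')
print('Results: ' + name_excel)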
Example #7
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""
    
    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')
    
    ## system call
    cmd_call = "%s -i %s -m -f >  %s 2> %s " %(agrvate_bin, 
                                               assembly_file,
                                               log_call, err_call) ## use mummer (-m) and force results folder (-f)
    status = HCGB_sys.system_call(cmd_call)
    
    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()
    
    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]    
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')
    
    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")
        
        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'), os.path.join(results_folder, 'error_report.tab'))
        
        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') ## open excel handle
    
        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)
    
        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab =  HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample
        
        ## columns
        # agr_group: gp1/gp2/gp3/gp4; 'u' means unknown.
        ##           If multiple agr groups were found (col 5 = m),
        ##           the displayed agr group is the majority/highest confidence.
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown.
        ##              Multiple groups are likely due to multiple S. aureus isolates in the sequence.
        # frameshifts: number found in the CDS of the extracted agr operon ('u' if the agr operon was not extracted)
        
        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel
        ## tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary') ## write excel handle

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab =  HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample
            
            ## columns
            ## Assembly Contig ID
            ## ID of matched agr group kmer
            ## evalue
            ## Percentage identity of match
            ## Start position of kmer alignment on input sequence
            ## End position of kmer alignment on input sequence
    
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)
            
            ## save agr_gp_tab file into excel
            ## tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon') ## write excel handle

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)
            
            results['operon_fna'] = agr_operon_fna_file
        except IndexError: ## no agr_operon.fna file produced
            results['operon_fna'] = ''

        ## error report tab
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report =  HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)
            
        ## save error_report file into excel
        ## tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps') ## write excel handle
        
        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save() ## close excel handle
    
        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)
        
    return (results)
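
A usage sketch under the assumption that the assembly file ends in '.fna' (the function strips that extension when locating the results folder); paths and the sample name are placeholders:

## usage sketch (placeholder paths)
results = agrvate_call('sample1', 'assembly/sample1.fna',
                       'agr_typing/sample1', debug=False)
if not results.empty:
    print(results)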
Example #8
def get_reference_gbk(options):

    ####################
    ## Genbank_ID
    ####################
    reference_gbk_file = ""
    if options.Genbank_ID:
        db_frame_ncbi = database_generator.getdbs('NCBI', options.database, 'genbank', options.debug)
    
        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_ncbi **", 'yellow'))
            print (db_frame_ncbi) 

        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.database)
        dir_path = os.path.join(NCBI_folder, 'genbank', 'bacteria', options.Genbank_ID)    
        if (options.Genbank_ID in db_frame_ncbi.index): 
            print('\t+ Reference (%s) available in database provided' %options.Genbank_ID)
        else:
            print ('\t+ Reference (%s) is not available in database provided' %options.Genbank_ID)
            print ('\t+ Try to download it.')
            database_generator.ngd_download(dir_path, options.Genbank_ID, NCBI_folder)
    
        ## get files download
        (genome, prot, gff, gbk) = database_generator.get_files_download(dir_path)
        if options.debug:
                print (colored("**DEBUG: genome:" + genome, 'yellow'))
                print (colored("**DEBUG: prot:" + prot, 'yellow'))
                print (colored("**DEBUG: gff:" + gff, 'yellow'))
                print (colored("**DEBUG: gbk:" + gbk, 'yellow'))
                
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n+ No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()
            
    ####################          
    ## user_sample_ID
    ####################
    elif options.user_sample_ID:
        db_frame_user_Data = database_user.get_userData_files(options, os.path.join(options.database, 'user_data'))
        df_data = db_frame_user_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.user_sample_ID)
            print('\t+ Reference (%s) available in database folder provided' %options.user_sample_ID)
        except KeyError:
            print (colored('** WARNING: Reference (%s) not available in database folder provided' %options.user_sample_ID, 'yellow'))
            print ("\t+ Let's try to update the database first.")
            db_frame_user_dataUpdated = database_user.update_database_user_data(options.database, options.input, options.debug, options)
            df_data = db_frame_user_dataUpdated.groupby('name')
 
            try:
                this_sample_df = df_data.get_group(options.user_sample_ID)
                print('\t+ Reference (%s) available in database updated' %options.user_sample_ID)
                db_frame_user_Data = db_frame_user_dataUpdated

            except KeyError:
                print(colored('\n** ERROR: No reference (%s) available in database updated. Some error occurred...' %options.user_sample_ID, 'red'))
                exit()

        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_user_Data **", 'yellow'))
            print (db_frame_user_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)


        ## get gbk file
        gbk = this_sample_df.loc[ this_sample_df['ext']=='gbf','sample'].values[0]
        
        ## debug
        if options.debug:
            print ("** DEBUG: this_sample_df")
            print (this_sample_df)
            print ('gbk:' + gbk)
  
        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()
        
    ####################    
    ## project_sample_ID
    ####################
    elif options.project_sample_ID:
        
        db_frame_project_Data = database_user.get_userData_files(options, options.input)
        df_data = db_frame_project_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.project_sample_ID)
            print('\t+ Reference (%s) available in project folder provided' %options.project_sample_ID)
        except KeyError:
            print (colored('** ERROR: Reference (%s) not available in project folder provided' %options.project_sample_ID, 'red'))
            print ('\t+ Check the spelling or provide a valid ID.')
            exit()
 
        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_project_Data **", 'yellow'))
            print (db_frame_project_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)

        ## get gbk file
        gbk = this_sample_df.loc[ this_sample_df['ext']=='gbf','sample'].values[0]

        ## debug
        if options.debug:
            print ("** DEBUG: this_sample_df")
            print (this_sample_df)
            print ('gbk:' + gbk)

        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## user_ref
    ####################
    elif options.user_ref:
        options.user_ref = os.path.abspath(options.user_ref)
        if HCGB_files.is_non_zero_file(options.user_ref):
            print('\t+ Reference provided via --user_ref is available and ready to use.')
        else:
            print('\n** ERROR: Reference provided via --user_ref not available or accessible.')
            print(colored('\n+ Check the path or integrity of the file. Some error occurred...', 'red'))
            exit()
        reference_gbk_file = options.user_ref

    
    return (reference_gbk_file)
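
A hedged sketch of the options object this function reads; argparse.Namespace stands in for the parsed command-line options, and every value is a placeholder:

## usage sketch: minimal options object with the attributes accessed above
import argparse

opts = argparse.Namespace(Genbank_ID='GCA_000013425.1', user_sample_ID=None,
                          project_sample_ID=None, user_ref=None,
                          database='database_folder', input='project_folder',
                          debug=False)
reference_gbk = get_reference_gbk(opts)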
Example #9
def parse_information(arg_dict, df_accID, outdir):

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "input",
                                                   arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project,
                                                   df_accID, "parse",
                                                   arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():

        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample],
                          'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'],
                          'yellow')
            debug_message('genome: ' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        if (not HCGB_files.is_non_zero_file(parse_timestamp)
                and not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):

                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(
                    df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(
                    folder_input, 'info.csv'),
                                              index=True,
                                              header=True)

                ## print time stamp
                HCGB_time.print_time_stamp(input_timestamp)

                ## print time stamp
                HCGB_time.print_time_stamp(parse_timestamp)
            else:
                print(
                    colored(
                        "\t+ Some error occurred for sample %s while parsing input options"
                        % sample, 'red'))

                ## print time stamp
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))

                ## print time stamp
                HCGB_time.print_time_stamp(
                    os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(
                colored(
                    "\t+ Input parsing already available for sample %s [%s]" %
                    (sample, read_time), 'green'))
            print()
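
A sketch of the df_accID layout this function assumes: a pandas DataFrame indexed by sample name with at least 'annot_file' and 'genome' columns (paths are placeholders):

## sketch of the expected df_accID input (placeholder paths)
import pandas as pd

df_accID = pd.DataFrame({'annot_file': ['input/sample1/annot.gbf'],
                         'genome': ['input/sample1/genome.fna']},
                        index=['sample1'])
## parse_information(arg_dict, df_accID, outdir)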
Example #10
def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNAbiotype analysis
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)

    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## mapReads_module is assumed to return the updated time stamp and the
    ## per-sample mapping results used below
    (start_time_partial, mapping_results) = mapReads_module(options, pd_samples_retrieved,
                                                            mapping_outdir_dict, options.debug,
                                                            max_workers_int, threads_job,
                                                            start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}

        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file
            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting join module.")
    return ()
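
A hedged reconstruction of the thread split used above; main_functions.optimize_threads presumably balances threads per job against the number of samples, and the worker count is whatever remains of the total (a sketch, not the actual implementation):

## hedged sketch: split total threads between concurrent featureCounts jobs
def _optimize_threads_sketch(total_threads, n_samples):
    ## give each sample job an equal share, at least one thread each
    return max(1, int(total_threads / n_samples))

threads_job = _optimize_threads_sketch(8, 4)   ## -> 2 threads per job
max_workers_int = int(8 / threads_job)         ## -> 4 jobs run concurrently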
Example #11
def run_prep(options):
	"""
	Main function of the prep module.
	
	This module prepares fastq files for later usage. It initially checks the length
	of each sample name and advises the user to rename samples if it is exceeded. Along ``BacterialTyper``
	there are a few string length limitations imposed by different software that need to be sorted
	out from the beginning of the process.
	
	This module allows the user to copy files into the project folder initiated, or to only
	create symbolic links to avoid duplicating raw data.
	
	See additional details of this module in user_guide :ref:`prep module entry<prep-description>`. 

	
	.. seealso:: This function depends on other HCGB functions called:
	
		- :func:`HCGB.sampleParser`
		
		- :func:`HCGB.functions.aesthetics_functions`
		
		- :func:`HCGB.functions.time_functions`
	
		- :func:`HCGB.functions.main_functions`
		
		- :func:`HCGB.functions.file_functions`
		
	"""
	
	## help_format option
	if (options.help_format):
		help_info.help_fastq_format()
		exit()
		
	HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
	HCGB_aes.boxymcboxface("Preparing samples")
	print ("--------- Starting Process ---------")
	HCGB_time.print_time()
	
	## init time
	start_time_total = time.time()
	
	## absolute path for in & out
	input_dir = os.path.abspath(options.input)
	outdir = os.path.abspath(options.output_folder)

	### set as default paired_end mode
	if (options.single_end):
		options.pair = False
	else:
		options.pair = True

	## Project mode as default
	project_mode=True
	if (options.detached):
		options.project = False
		project_mode=False
	else:
		options.project = True

	## output folder	
	print ("\n+ Create output folder(s):")
	HCGB_files.create_folder(outdir)

	### info
	final_dir = ""
	if (options.project):
		print ("+ Generate a directory containing information within the project folder provided")
		final_dir = HCGB_files.create_subfolder("info", outdir)
	else:
		final_dir = outdir
	
	## get files
	pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
		
	## Information returned in pd_samples_retrieved
	### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz
	
	if options.debug:
		HCGB_aes.debug_message("pd_samples_retrieved", "yellow")
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
	
	## time stamp
	start_time_partial = HCGB_time.timestamp(start_time_total)
	
	## check character limitation
	list_lengths = pd_samples_retrieved.loc[:,'name_len'].to_list()
	if any(i > 10 for i in list_lengths):
		print (colored("\t ** Name lengths exceeds the 10 character limitation...", 'yellow'))
		if not (options.rename):
			print (colored("** ERROR: Rename files or provide --rename option...", 'red'))
			exit()

	### rename files 
	if (options.rename):
		options.rename = os.path.abspath(options.rename)
		if not HCGB_files.is_non_zero_file(options.rename):
			print (colored("** ERROR: File provided with rename information is not readable.", 'red'))
			print (options.rename)
			exit()
		
		names_retrieved = pd.read_csv(options.rename, sep=',', 
									index_col=0, squeeze=True, 
									header=None).to_dict() ## read csv to dictionary
		if (options.debug):
			HCGB_aes.debug_message("names_retrieved", "yellow")
			print (names_retrieved)
			
		## TODO: check integrity of new names and special characters
	
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		rename_details = final_dir + '/' + timestamp + '_prep_renameDetails.txt'
		rename_details_hd = open(rename_details, 'w')
	
		## rename files 		
		for index, row in pd_samples_retrieved.iterrows():
			if (row['gz']):
				extension_string = row['ext'] + row['gz']
			else:
				extension_string = row['ext']
			
			if options.single_end:
				renamed = names_retrieved[row['name']] + '.' + extension_string
			else:
				renamed = names_retrieved[row['name']] + '_' + row['read_pair'] + '.' + extension_string
			
			## modify frame
			pd_samples_retrieved.loc[index, 'new_file'] = renamed
			pd_samples_retrieved.loc[index, 'new_name'] = names_retrieved[row['name']]
			## save in file
			string = row['sample'] + '\t' + renamed + '\n'
			rename_details_hd.write(string)
			
			if (options.debug):
				print (colored('** DEBUG: rename', 'yellow'))
				print ("Original: ", row['name'])
				print ("Renamed: ", names_retrieved[row['name']])
				print ("File:", renamed)
		
		rename_details_hd.close()	

		## works for both single-end and paired-end files
		print ("+ Sample files have been renamed...")
	else:
		pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

	## create outdir for each sample
	outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved, "raw", options.debug)	
		
	## merge option
	if (options.merge):
		print ("+ Sample files will be merged...")
		## TODO: check when rename option provided
		pd_samples_merged = sampleParser.merge.one_file_per_sample(
			pd_samples_retrieved, outdir_dict, options.threads,	
			final_dir, options.debug)
		
		if (options.rename):
			print ("+ Merge files have been renamed...")
		else:
			print ("+ Sample files have been merged...")
		
		## process is finished here
		print ("\n*************** Finish *******************")
		start_time_partial = HCGB_time.timestamp(start_time_total)
	
		print ("+ Exiting prep module.")
		exit()
	
	## debugging messages
	if (options.debug):
		print (colored("** DEBUG: pd_samples_retrieved", 'yellow'))
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
		print (colored("** DEBUG: outdir_dict", 'yellow'))
		print (outdir_dict)
	
	## copy or create symbolic link for files
	if (options.copy):
		print ("+ Sample files will be copied...")
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		copy_details = final_dir + '/' + timestamp + '_prep_copyDetails.txt'
		copy_details_hd = open(copy_details, 'w')
	else:
		print ("+ Sample files will be linked...")	
	
	list_reads = []
	for index, row in pd_samples_retrieved.iterrows():
		if (options.copy):
			## TODO: debug & set threads to copy faster
			shutil.copy(row['sample'], os.path.join(outdir_dict[row['new_name']], row['new_file']))
			string = row['sample'] + '\t' + os.path.join(outdir_dict[row['new_name']], row['new_file']) + '\n'
			copy_details_hd.write(string)
		else:
			list_reads.append(row['new_file'])
			
			if options.project:
				HCGB_files.get_symbolic_link_file(row['sample'],
									os.path.join(outdir_dict[row['new_name']], row['new_file']))

	if (options.copy):
		print ("+ Sample files have been copied...")
		copy_details_hd.close()
	else:
		if not options.project:
			HCGB_files.get_symbolic_link(list_reads, outdir)
	
	print ("\n*************** Finish *******************")
	start_time_partial = HCGB_time.timestamp(start_time_total)

	print ("+ Exiting prep module.")
	return()
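
A sketch of the --rename CSV this module expects: headerless, comma-separated, mapping the original sample name to its new name, mirroring the pd.read_csv call above (the file name and sample names are placeholders):

## rename.csv (no header), e.g.:
##   long_original_sample_name_1,s1
##   long_original_sample_name_2,s2
import pandas as pd

names_retrieved = pd.read_csv('rename.csv', sep=',', index_col=0,
                              squeeze=True, header=None).to_dict()
## -> {'long_original_sample_name_1': 's1', 'long_original_sample_name_2': 's2'}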