Example #1
def module_call(db_folder, dictionary_fasta_files, debug):
    """
    Creates or locates the spaTyper database, types each sample fasta
    file provided and returns a summary dataframe of repeats and repeat types.
    """

    files_functions.create_folder(db_folder)
    if db_folder.endswith("spaTyper"):
        spaTyper_db = db_folder
    else:
        spaTyper_db = os.path.join(db_folder, "spaTyper")
        files_functions.create_folder(spaTyper_db)

    ## check if files are available
    (spaTyper_repeats, spaTyper_types) = check_files(spaTyper_db, debug)

    ## Get the SpaTypes in fasta sequences
    seqDict, letDict, typeDict, seqLengths = spaTyper.spa_typing.getSpaTypes(
        spaTyper_repeats, spaTyper_types, debug)

    ## debug messages
    if debug:
        print(
            '## Debug: seqDict: Too large to print: See repeat_file for details'
        )
        print(
            '## Debug: typeDict: Too large to print: See repeat_order_file for details'
        )
        print('## Debug: letDict: conversion dictionary')
        print(letDict)
        print('## Debug: seqLengths:')
        print(seqLengths)

    ## summary results
    results_summary = pd.DataFrame(columns=("sample", "sequence", "Repeats",
                                            "Repeat Type"))

    ## for each sample get spaType
    for key, value in dictionary_fasta_files.items():
        print("+ Sample: ", key)
        returned_value = call_spaTyper(value, seqDict, letDict, typeDict,
                                       seqLengths, debug)

        if len(returned_value.keys()) > 1:
            print(
                colored(
                    "** Attention: >1 spaTypes detected for sample: %s" % key,
                    'red'))

        for j in returned_value.keys():
            splitted = returned_value[j].split('::')

            results_summary.loc[len(results_summary)] = (key, j, splitted[2],
                                                         splitted[1])
            ## debug messages
            if debug:
                print("Sequence name: ", j, "Repeats:", splitted[2],
                      "Repeat Type:", splitted[1], '\n')

    ##
    return (results_summary)
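A minimal usage sketch for the example above; the database path and the fasta dictionary are illustrative placeholders, not taken from the source:

## Hypothetical inputs for illustration only
fasta_files = {
    "sample1": "/data/assemblies/sample1.fasta",
    "sample2": "/data/assemblies/sample2.fasta",
}
results_df = module_call("/data/db", fasta_files, debug=False)
print(results_df)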
Example #2
def run_module_SPADES_old(name, folder, file1, file2, threads):

	print ("+ Calling spades assembly for sample...", name)	

	## folder create
	HCGB_files.create_folder(folder)
	
	## get configuration
	SPADES_bin = set_config.get_exe('spades')
	
	## assembly main 
	path_to_contigs = run_SPADES_assembly(folder, file1, file2, name, SPADES_bin, threads)

	## assembly plasmids
	path_to_plasmids = run_SPADES_plasmid_assembly(folder, file1, file2, name, SPADES_bin, threads)
	
	## discard plasmids from main
	(tmp_contigs, tmp_plasmids) = discardPlasmids(path_to_contigs, path_to_plasmids, folder, name)
	
	## rename fasta sequences
	new_contigs = tmp_contigs.split(".fna.tmp")[0] + '.fna'	
	rename_contigs(tmp_contigs, "scaffolds_chr", new_contigs)
	
	new_plasmids=""
	if os.path.isfile(tmp_plasmids):
		new_plasmids = tmp_plasmids.split(".fna.tmp")[0] + '.fna'	
		rename_contigs(tmp_plasmids, "scaffolds_plasmids", new_plasmids)
	
	## contig stats
	stats(new_contigs, new_plasmids)
	
	## success stamps
	filename_stamp = folder + '/.success'
	stamp = HCGB_time.print_time_stamp(filename_stamp)
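A hedged call sketch for the assembly wrapper above; the sample name and read paths are placeholders:

## Hypothetical inputs for illustration only
run_module_SPADES_old("sample1", "/data/assembly/sample1",
		"/data/reads/sample1_R1.fastq.gz",
		"/data/reads/sample1_R2.fastq.gz", threads=4)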
Example #3
def ariba_pubmlstget(species, outdir):
	######################################################################################
	## usage: ariba pubmlstget [options] <"species in quotes"> <output_directory>
	######################################################################################	
	## Download typing scheme for a given species from PubMLST, and make an ARIBA db
	## positional arguments:
	##  species     Species to download. Put it in quotes
	##	outdir      Name of output directory to be made (must not already exist)
	######################################################################################
	
	## download information in database folder provided by config
	print ("+ Call ariba module 'pubmlstget' to retrieve MLST information.")
	HCGB_files.create_folder(outdir)
	cmd = 'ariba pubmlstget "%s" %s' %(species, outdir)
	return(HCGB_sys.system_call(cmd))
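A one-line usage sketch; the species name and output folder are placeholders:

## Hypothetical call: download the PubMLST scheme for S. aureus into a new folder
ariba_pubmlstget("Staphylococcus aureus", "/data/db/MLST/Staphylococcus_aureus")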
Example #4
def R_package_path_installed():
    """Provides the absolute path to the file ``R_package.info.txt``, which stores the path where missing R packages were installed"""

    ## check if exists or try to install
    RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'R', 'R_package.info.txt')

    if HCGB_files.is_non_zero_file(RDir_package):
        pkg_list = HCGB_main.readList_fromFile(RDir_package)
        return (pkg_list[0])
    else:
        path2install = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'R',
            'install_packages')
        HCGB_files.create_folder(path2install)
        return (path2install)
Example #5
def download_VFDB_files(folder):
    ##
    ## Given a folder, check if it contains VFDB information
    ## or download it from website: http://www.mgc.ac.cn
    ##
    links = (
        "http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz",
        "http://www.mgc.ac.cn/VFs/Down/Comparative_tables_from_VFDB.tar.gz")

    ## check if data is downloaded, how old is the data and if it is necessary to download again
    ## consider >30 days long enough to be updated again

    ## time stamp
    filename_stamp = folder + '/download_timestamp.txt'
    if os.path.exists(folder):
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print("+ A previous download generated results on: ", stamp)
            days_passed = HCGB_time.get_diff_time(filename_stamp)
            print("\t\t** %s days ago" % days_passed)
            if (days_passed > 30):  ## download again
                print(
                    "\t\t** Downloading information again just to be sure...")
            else:
                print("\t\t** No need to download data again.")
                return ()
    else:
        HCGB_files.create_folder(folder)

    ## Open file and readlines
    print('+ Downloading files:\n')
    for line in links:
        if not line.startswith('#'):
            HCGB_sys.wget_download(line, folder)

    ## decompress files
    print('+ Decompressing gzip files\n')
    files = os.listdir(folder)
    for item in files:
        #print (folder)
        if item.endswith('.gz'):
            HCGB_files.extract(folder + '/' + item, folder)

    ## make stamp time
    HCGB_time.print_time_stamp(filename_stamp)

    return ()
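A minimal usage sketch with an assumed database path:

## Hypothetical call: fetch or refresh VFDB files in a local database folder
download_VFDB_files("/data/db/VFDB")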
Example #6
def install_R_packages(package, source, install_path, extra):
	
	(install_R, install_github_package) = get_install_R_files()
	
	HCGB_files.create_folder(install_path)
	Rscript_exe = set_config.get_exe('Rscript')
	print("+ Installing %s package..." %package)
	install_file = install_R
	if (source == 'github'):
		install_file = install_github_package
		package= extra + '/' + package
	
	cmd_R = '%s %s -l %s -p %s' %(Rscript_exe, install_file, package, install_path)
	HCGB_sys.system_call(cmd_R)
	
	## check if exists or try to install
	MLSTar_package = os.path.join(install_path, 'MLSTar')
	if os.path.exists(MLSTar_package):
		RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'R', 'R_package.info.txt')
		HCGB_main.printList2file(RDir_package, [install_path])
	else:
		print_error_message(package, "No R package found", 'package')
		print ('Please install manually to proceed...')
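Two hedged call sketches for the installer above; the library path is an assumption, and for the github source the ``extra`` argument is interpreted as the repository user:

## Hypothetical calls; install paths and package origins are illustrative
install_R_packages("ggplot2", "CRAN", "/data/R_libs", "")
install_R_packages("MLSTar", "github", "/data/R_libs", "iferres")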
Example #7
def main():

    ## control if options provided or help
    if len(sys.argv) > 1:
        print ("")
    else:
        help_options()
        exit()
        
    name = sys.argv[1]
    fasta_file = os.path.abspath(sys.argv[2])
    folder = os.path.abspath(sys.argv[3])
    debug=True

    ## path
    folder = HCGB_files.create_folder(folder)
    ## ATTENTION: agrvate needs to chdir to output folder
    os.chdir(folder)

    ###
    agrvate_call(name, fasta_file, folder, debug)
Example #8
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters"""
    
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()
    
    print ("+ Checking agr genes for each sample retrieved...")
    
    agrvate_results = pd.DataFrame()
    
    ## No need to optimize. There is a problem with the working dir of agrvate and we 
    ## need to change every time.
    for name, assembly_file in dict_assemblies.items():
        sample_folder = HCGB_files.create_folder(dict_folders[name])
        ## check if previously done and succeeded
        filename_stamp = sample_folder + '/.success'
        if os.path.isfile(filename_stamp):
            stamp =  HCGB_time.read_time_stamp(filename_stamp)
            print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        else:
            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')
            
            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    
    print ("+ Jobs finished\n+ Collecting information for all samples...")
    os.chdir(path_here)
    
    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)
    
    return(agrvate_results)
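A short usage sketch for the caller above; sample names and paths are placeholders:

## Hypothetical inputs: one assembly file and one result folder per sample
dict_assemblies = {"sample1": "/data/assembly/sample1.fna"}
dict_folders = {"sample1": "/data/results/agrvate/sample1"}
agrvate_df = agrvate_caller(dict_assemblies, dict_folders, debug=False)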
Example #9
def index_database(fileToIndex, kma_bin, index_name, option, folder, type_option):
	"""
	Calls KMA_ software to index fasta files into a database for later KMA identification.
	
	:param fileToIndex: Fasta file to include in the database.
	:param kma_bin: Absolute path to kma executable binary. 
	:param index_name: Name for the database.
	:param option: Option to create or update a database.
	:param folder: Absolute path to folder containing database.
	:param type_option: Set to 'batch' to index from a batch file listing fasta files [Default: off].
	
	:type fileToIndex: string
	:type kma_bin: string 
	:type index_name: string 
	:type option: string 
	:type folder: string 
	:type type_option: string 	 
	
	:returns: It returns message from :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`.
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.create_folder`
		
		- :func:`BacterialTyper.scripts.functions.system_call`
		
		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`
	"""	
	
	########################################################################################
	## 								KMA_index-1.2.2							
	########################################################################################
	# kma_index creates the databases needed to run KMA, from a list of fasta files given.
	# Options are:		
	#				Desc:									Default:
	#
	#	-i			Input/query file name (STDIN: "--")		None
	#	-o			Output file								Input/template file
	#	-batch		Batch input file
	#	-deCon		File with contamination (STDIN: "--")	None/False
	#	-batchD		Batch decon file
	#	-t_db		Add to existing DB						None/False
	#	-k			Kmersize								16
	#	-k_t		Kmersize for template identification	16
	#	-k_i		Kmersize for indexing					16
	#	-ML			Minimum length of templates	kmersize 	(16)	
	#	-CS			Start Chain size						1 M
	#	-ME			Mega DB									False
	#	-NI			Do not dump *.index.b					False
	#	-Sparse		Make Sparse DB ('-' for no prefix)		None/False
	#	-ht			Homology template						1.0
	#	-hq			Homology query							1.0
	#	-and		Both homology thresholds
	#				have to be reached						or
	#	-v			Version
	#	-h			Shows this help message
	#######################################################################################
	
	## check if file exists
	if os.path.isfile(index_name):
		index_file_name = index_name
	else:
		index_file_name = folder + '/' + index_name
		
	logFile = index_file_name + '.log'
	
	## check if folder exists
	HCGB_files.create_folder(folder)

	## single file
	if (type_option == 'batch'):
		type_option = '-batch'
	else:
		type_option = '-i'
	
	## new or add to existing db
	if (option == "new"):
		print ("\n+ Generate and index database for kmer alignment search...\n")
		cmd_kma_index = "%s index %s %s -o %s 2> %s" %(kma_bin, type_option, fileToIndex, index_file_name, logFile)
	elif (option == "add"):
		print ("\n+ Updating database with new entries...\n")
		cmd_kma_index = "%s index %s %s -o %s -t_db %s 2> %s" %(kma_bin, type_option, fileToIndex, index_file_name, index_file_name, logFile)
	else:
		## guard against an unexpected option value (would otherwise raise NameError below)
		print (colored("Unknown option provided for index_database: %s" %option, 'red'))
		exit()

	code = HCGB_sys.system_call(cmd_kma_index)	
	if code == 'FAIL':
		print (colored("Database generated an error during the index: %s" %index_name, 'red'))
		print (colored("EXIT", 'red'))
		exit()
		
	return_code = check_db_indexed(index_file_name, folder)
	return(return_code)
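Two hedged call sketches, one creating and one updating a database; the kma binary location and file names are assumptions:

## Hypothetical usage; kma_bin would normally come from set_config.get_exe('kma')
kma_bin = "/usr/local/bin/kma"
## create a new database from a single fasta file
index_database("/data/genomes/reference.fasta", kma_bin, "my_db",
		"new", "/data/db/KMA", "single")
## add entries listed in a batch file to the existing database
index_database("/data/genomes/batch_list.txt", kma_bin, "my_db",
		"add", "/data/db/KMA", "batch")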
Example #10
def BUSCO_check(input_dir, outdir, options, start_time_total, mode):

    HCGB_aes.boxymcboxface("BUSCO Analysis Quality check")

    ## absolute path for in & out
    database_folder = os.path.abspath(options.database)

    ## get files and get dir for each sample according to mode
    if mode == 'genome':
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("assembly_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "assemble_qc",
                                                      options.debug)

    elif mode == 'proteins':
        pd_samples_retrieved = sampleParser.files.get_files(
            options, outdir, "annot", ["faa"], options.debug)  ##

        if not options.project:
            outdir = HCGB_files.create_subfolder("annot_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "annot_qc",
                                                      options.debug)

    ## add column to dataframe
    pd_samples_retrieved['busco_folder'] = ""
    for index, row in pd_samples_retrieved.iterrows():
        pd_samples_retrieved.at[index, 'busco_folder'] = BUSCO_outdir_dict[
            row['name']]

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("df_samples_busco", 'yellow')
        print(pd_samples_retrieved)

        HCGB_aes.debug_message("BUSCO_outdir_dict", 'yellow')
        print(BUSCO_outdir_dict)

    ## Check each using BUSCO
    BUSCO_Database = HCGB_files.create_subfolder('BUSCO', database_folder)
    if not os.path.exists(BUSCO_Database):
        HCGB_files.create_folder(BUSCO_Database)

    ## call
    (dataFrame_results, stats_results) = BUSCO_caller.BUSCO_call(
        options.BUSCO_dbs, pd_samples_retrieved, BUSCO_Database,
        options.threads, mode)

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("dataFrame_results", 'yellow')
        HCGB_main.print_all_pandaDF(dataFrame_results)

    ## functions.timestamp
    print("+ Quality control of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## multiqc report plot
    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report BUSCO plot.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders."
        )

        ## name folder according to mode
        if mode == 'genome':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_assembly",
                                                       outdir_report)
        elif mode == 'proteins':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_annot",
                                                       outdir_report)

        ## generate plots
        print("+ Generate summarizing plots...")
        BUSCO_caller.BUSCO_plots(dataFrame_results, BUSCO_report,
                                 options.threads)
        print('\n+ Check quality plots in folder: %s' % BUSCO_report)

        ##	TODO
        ##	Parse BUSCO statistics in dataframe (stats_results) for discarding samples if necessary
        ##	given a cutoff, discard or advise to discard some samples

        ### print statistics
        stats_results.to_csv(BUSCO_report + "/BUSCO_stats.csv")
        name_excel = BUSCO_report + "/BUSCO_stats.xlsx"
        writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
        stats_results.to_excel(writer, sheet_name="BUSCO statistics")
        writer.save()

        print('\n+ Check quality statistics in folder: %s' % BUSCO_report)

    return (dataFrame_results)
Example #11
def run_phylo(options):
    """
    Main function acting as an entry point to the module *phylo*.
    """

    ##################################
    ### show help messages if desired    
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False
        
    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Phylogenetic reconstruction")

    print ("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir=""

    ## set mode: project/detached
    ## Project mode as default
    project_mode=True
    if (options.detached):
        options.project = False
        project_mode=False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir    
    
    ## get the database 
    options.database = os.path.abspath(options.database)
    
    ### parse the reference
    print ("+ Retrieve the reference...")
    reference_gbk_file = get_reference_gbk(options)
                 
    ## generate output folder, if necessary
    print ("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    
    ##################################
    ## select samples and map    
    ####################################
    print ("+ Retrieve samples to map available...")
    dict_folders = map_samples(options, reference_gbk_file, input_dir, outdir)
    
    if Debug:
        print (colored("**DEBUG: dict_folders **", 'yellow'))
        print (dict_folders)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ##################################
    ## Create core alignment
    ##################################
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    phylo_dir = HCGB_files.create_subfolder("phylo", outdir_report)
    analysis_dir = HCGB_files.create_subfolder(options.name, phylo_dir)
    snippy_dir = HCGB_files.create_subfolder("snippy", analysis_dir)
        
    list_folders = list(dict_folders.values())
    options_string = ""
    variant_calling.snippy_core_call(list_folders, options_string, options.name, 
                                     snippy_dir, options.output_format, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## snp distance matrix
    snp_distance_dir = HCGB_files.create_subfolder("snp_distance", analysis_dir)
    name_matrix = os.path.join(snp_distance_dir, "snp_matrix_" + options.name)
    
    countGaps = False
    aln_file = os.path.join(snippy_dir, options.name + '.aln')
    phylo_parser.get_snp_distance(aln_file, options.output_format, countGaps, name_matrix, Debug)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## phylogenetic analysis
    iqtree_output = HCGB_files.create_subfolder("iqtree", analysis_dir)
    phylo_parser.ml_tree(snippy_dir, options.name, options.threads, iqtree_output, Debug)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("+ Exiting phylo module.")
    return()
Example #12
def run_prep(options):
	"""
	Main function of the prep module.
	
	This module prepares fastq files for later usage. It initially checks the length
	of the sample names and advises the user to rename samples if the limit is exceeded. Along ``BacterialTyper``
	there are a few string length limitations imposed by different software that need to be sorted
	out from the beginning of the process.
	
	This module allows the user to either copy files into the initiated project folder or only to link
	them using a symbolic link and avoid duplicating raw data.
	
	See additional details of this module in user_guide :ref:`prep module entry<prep-description>`. 

	
	.. seealso:: This function depends on other HCGB functions called:
	
		- :func:`HCGB.sampleParser`
		
		- :func:`HCGB.functions.aesthetics_functions`
		
		- :func:`HCGB.functions.time_functions`
	
		- :func:`HCGB.functions.main_functions`
		
		- :func:`HCGB.functions.file_functions`
		
	"""
	
	## help_format option
	if (options.help_format):
		help_info.help_fastq_format()
		exit()
		
	HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
	HCGB_aes.boxymcboxface("Preparing samples")
	print ("--------- Starting Process ---------")
	HCGB_time.print_time()
	
	## init time
	start_time_total = time.time()
	
	## absolute path for in & out
	input_dir = os.path.abspath(options.input)
	outdir = os.path.abspath(options.output_folder)

	### set as default paired_end mode
	if (options.single_end):
		options.pair = False
	else:
		options.pair = True

	## Project mode as default
	project_mode=True
	if (options.detached):
		options.project = False
		project_mode=False
	else:
		options.project = True

	## output folder	
	print ("\n+ Create output folder(s):")
	HCGB_files.create_folder(outdir)

	### info
	final_dir = ""
	if (options.project):
		print ("+ Generate a directory containing information within the project folder provided")
		final_dir = HCGB_files.create_subfolder("info", outdir)
	else:
		final_dir = outdir
	
	## get files
	pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
		
	## Information returned in pd_samples_retrieved
	### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz
	
	if options.debug:
		HCGB_aes.debug_message("pd_samples_retrieved", "yellow")
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
	
	## time stamp
	start_time_partial = HCGB_time.timestamp(start_time_total)
	
	## check character limitation
	list_lengths = pd_samples_retrieved.loc[:,'name_len'].to_list()
	if any(i > 10 for i in list_lengths):
		print (colored("\t ** Name lengths exceed the 10 character limitation...", 'yellow'))
		if not (options.rename):
			print (colored("** ERROR: Rename files or provide --rename option...", 'red'))
			exit()

	### rename files 
	if (options.rename):
		options.rename = os.path.abspath(options.rename)
		if not HCGB_files.is_non_zero_file(options.rename):
			print (colored("** ERROR: File provided with rename information is not readable.", 'red'))
			print (options.rename)
			exit()
		
		names_retrieved = pd.read_csv(options.rename, sep=',', 
									index_col=0, squeeze=True, 
									header=None).to_dict() ## read csv to dictionary
		if (options.debug):
			HCGB_aes.debug_message("names_retrieved", "yellow")
			print (names_retrieved)
			
		## TODO: check integrity of new names and special characters
	
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		rename_details = final_dir + '/' + timestamp + '_prep_renameDetails.txt'
		rename_details_hd = open(rename_details, 'w')
	
		## rename files 		
		for index, row in pd_samples_retrieved.iterrows():
			if (row['gz']):
				extension_string = row['ext'] + row['gz']
			else:
				extension_string = row['ext']
			
			if options.single_end:
				renamed = names_retrieved[row['name']] + '.' + extension_string
			else:
				renamed = names_retrieved[row['name']] + '_' + row['read_pair'] + '.' + extension_string
			
			## modify frame
			pd_samples_retrieved.loc[index, 'new_file'] = renamed
			pd_samples_retrieved.loc[index, 'new_name'] = names_retrieved[row['name']]
			## save in file
			string = row['sample'] + '\t' + renamed + '\n'
			rename_details_hd.write(string)
			
			if (options.debug):
				print (colored('** DEBUG: rename', 'yellow'))
				print ("Original: ", row['name'])
				print ("Renamed: ", names_retrieved[row['name']])
				print ("File:", renamed)
		
		rename_details_hd.close()	

		##elif (options.single_end): It should work for both
		print ("+ Sample files have been renamed...")
	else:
		pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

	## create outdir for each sample
	outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved, "raw", options.debug)	
		
	## merge option
	if (options.merge):
		print ("+ Sample files will be merged...")
		## TODO: check when rename option provided
		pd_samples_merged = sampleParser.merge.one_file_per_sample(
			pd_samples_retrieved, outdir_dict, options.threads,	
			final_dir, options.debug)
		
		if (options.rename):
			print ("+ Merge files have been renamed...")
		else:
			print ("+ Sample files have been merged...")
		
		## process is finished here
		print ("\n*************** Finish *******************")
		start_time_partial = HCGB_time.timestamp(start_time_total)
	
		print ("+ Exiting prep module.")
		exit()
	
	## debugging messages
	if (options.debug):
		print (colored("** DEBUG: pd_samples_retrieved", 'yellow'))
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
		print (colored("** DEBUG: outdir_dict", 'yellow'))
		print (outdir_dict)
	
	## copy or create symbolic link for files
	if (options.copy):
		print ("+ Sample files will be copied...")
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		copy_details = final_dir + '/' + timestamp + '_prep_copyDetails.txt'
		copy_details_hd = open(copy_details, 'w')
	else:
		print ("+ Sample files will be linked...")	
	
	list_reads = []
	for index, row in pd_samples_retrieved.iterrows():
		if (options.copy):
			## TODO: debug & set threads to copy faster
			shutil.copy(row['sample'], os.path.join(outdir_dict[row['new_name']], row['new_file']))
			string = row['sample'] + '\t' + os.path.join(outdir_dict[row['new_name']], row['new_file']) + '\n'
			copy_details_hd.write(string)
		else:
			list_reads.append(row['new_file'])

			if options.project:
				HCGB_files.get_symbolic_link_file(row['sample'],
						os.path.join(outdir_dict[row['new_name']], row['new_file']))

	if (options.copy):
		print ("+ Sample files have been copied...")
		copy_details_hd.close()
	else:
		if not options.project:
			HCGB_files.get_symbolic_link(list_reads, outdir)
	
	print ("\n*************** Finish *******************")
	start_time_partial = HCGB_time.timestamp(start_time_total)

	print ("+ Exiting prep module.")
	return()
Example #13
def run(options):
    """
	This is the main function of the module ``config``. It basically checks 
	whether the different requirements (Python and third-party software) are
	fulfilled. 

	If any requirement is not available, this module tries to install it or reports to the user to
	manually install it.

	:param option: State whether to check or install missing modules, packages and third party software. Provide: check/install
	:param install_path: Absolute path to install modules or packages missing. Default: ``BacterialTyper`` environment path.
	:param IslandPath: True/False for checking additional perl and software required by this option analysis.
	:param debug: True/false for debugging messages.
	
	:type option: string 
	:type IslandPath: boolean
	:type install_path: string 
	:type debug: boolean	

	.. seealso:: This function depends on several ``BacterialTyper`` functions:

		- :func:`BacterialTyper.config.set_config.check_python_packages`

		- :func:`BacterialTyper.config.set_config.check_perl_packages`

		- :func:`BacterialTyper.config.extern_progs.return_min_version_soft`

		- :func:`BacterialTyper.config.extern_progs.print_dependencies`

	"""

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Pipeline Configuration")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    if (options.install_path):
        if os.path.isdir(options.install_path):
            if (Debug):
                print(
                    "Installation path provided for missing modules, packages, dependencies..."
                )
                print("Path: " + options.install_path)
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()
    else:
        ## get python environment path
        env_bin_directory = os.path.dirname(os.environ['_'])

        ##os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'templates'))
        options.install_path = os.path.abspath(
            os.path.join(env_bin_directory, '../software'))

        if (Debug):
            print("Retrieve environment path as installation path:")
            print("Path: " + options.install_path)

        HCGB_files.create_folder(options.install_path)

    #######################
    ## install or only check
    #######################
    option_install = False
    if (options.option == 'install'):
        print("\n+ Check dependencies")
        print(
            "+ Try to install all missing dependencies, modules or third party software..."
        )
        option_install = True

        ## check if access and permission
        if os.path.isdir(options.install_path):
            if (set_config.access_check(options.install_path, mode=os.F_OK)):
                print(
                    "Installation path is accessible and has permission for installation if necessary"
                )
            else:
                print(colored("\n*** ERROR ****", 'red'))
                print(
                    colored(
                        "No access/permission for this path: %s" %
                        options.install_path, 'red'))
                print(
                    colored(
                        "Please provide a valid path with access/permission to install any missing dependencies.",
                        'red'))
                exit()
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()

    elif (options.option == 'only_check'):
        print(
            "\nCheck dependencies, modules or third party software and print report..."
        )

    #######################
    ## python version
    #######################
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python:')
    HCGB_aes.print_sepLine("+", 20, False)

    this_python_version = str(sys.version)
    python_min_version = extern_progs.return_min_version_soft('python')
    if LooseVersion(this_python_version) >= LooseVersion(python_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (python_min_version, this_python_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (python_min_version, this_python_version), 'red'))
        exit()

    #######################
    ## perl_version
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Perl:')
    HCGB_aes.print_sepLine("+", 50, False)

    perl_min_version = extern_progs.return_min_version_soft('perl')
    this_perl_path = set_config.get_exe("perl", Debug)
    this_perl_version = set_config.get_version("perl", this_perl_path, Debug)
    if LooseVersion(this_perl_version) >= LooseVersion(perl_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (perl_min_version, this_perl_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (perl_min_version, this_perl_version), 'red'))
        exit()

    #######################
    ## third-party software
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('External dependencies:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_dependencies(option_install, options.install_path, Debug)
    print('\n')

    #######################
    ## python packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_python_packages(Debug, option_install,
                                     options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## perl packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Perl packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_perl_packages("perl_dependencies", Debug, option_install,
                                   options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## IslandPath dependencies
    #######################
    if (options.IslandPath):
        print('\n')
        HCGB_aes.print_sepLine("+", 20, False)
        print('IslandPath packages and software required:')
        HCGB_aes.print_sepLine("+", 20, False)

        set_config.check_IslandPath(Debug, option_install,
                                    options.install_path)
        HCGB_aes.print_sepLine("+", 20, False)
        print('\n')

    #######################
    ## R packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('R packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_R_packages(option_install, options.install_path, Debug)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')
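These module entry points are driven by an ``options`` object; a minimal sketch of a check-only invocation could look like the following, with the attribute set inferred from the code above and possibly incomplete:

import argparse

## Hypothetical options object; attributes inferred from the code above
options = argparse.Namespace(debug=False, option='only_check',
                             install_path="/tmp", IslandPath=False)
run(options)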
Example #14
def biotype_all(featureCount_exe, path, gtf_file, bam_file, name, threads, Debug, allow_multimap, stranded):
	
	## folder for results
	if not os.path.isdir(path):
		files_functions.create_folder(path)

	out_file = os.path.join(path, 'featureCount.out')
	logfile = os.path.join(path, name + '_RNAbiotype.log')

	filename_stamp_all = path + '/.success_all'
	if os.path.isfile(filename_stamp_all):
		stamp = time_functions.read_time_stamp(filename_stamp_all)
		print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'RNAbiotype'), 'yellow'))
		return()

	else:
		filename_stamp_featureCounts = path + '/.success_featureCounts'
		if os.path.isfile(filename_stamp_featureCounts):
			stamp = time_functions.read_time_stamp(filename_stamp_featureCounts)
			print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'featureCounts'), 'yellow'))
		else:

			## debugging messages
			if Debug:
				print ("** DEBUG:")
				print ("featureCounts system call for sample: " + name)
				print ("out_file: " + out_file)
				print ("logfile: " + logfile)
		
			## send command for feature count
			## Allow multimapping
			if allow_multimap:
				cmd_featureCount = ('%s -s %s -M -O -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
			else:
				cmd_featureCount = ('%s -s %s --largestOverlap -T %s -p -t exon -g transcript_biotype -a %s -o %s %s 2> %s' %(
					featureCount_exe, stranded, threads, gtf_file, out_file, bam_file, logfile)
				)
				
				
			## system call
			cmd_featureCount_code = system_call_functions.system_call(cmd_featureCount, False, True)
			if not cmd_featureCount_code:
				print("** ERROR: featureCount failed for sample " + name)
				exit()
				
			## print time stamp
			time_functions.print_time_stamp(filename_stamp_featureCounts)
		
		## parse results
		(extended_Stats_file, RNAbiotypes_stats_file) = parse_featureCount(out_file, path, name, bam_file, Debug)
		
		## debugging messages
		if Debug:
			print ("** DEBUG:")
			print ("extended_Stats: " + extended_Stats_file)
			print (main_functions.get_data(extended_Stats_file, '\t', 'header=None'))
			print ("RNAbiotypes_stats: " + RNAbiotypes_stats_file)
			print (main_functions.get_data(RNAbiotypes_stats_file, '\t', 'header=None'))

	return ()
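A hedged call sketch for the function above; the executable, annotation and alignment paths are placeholders:

## Hypothetical inputs; paths are assumed for illustration
featureCounts_exe = "/usr/local/bin/featureCounts"
biotype_all(featureCounts_exe, "/data/results/sample1/biotype",
		"/data/annotation/genes.gtf", "/data/map/sample1.bam",
		"sample1", threads=2, Debug=False,
		allow_multimap=True, stranded=0)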
Example #15
def run_profile(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    if (options.help_ARIBA):
        ## help_format option
        ariba_caller.help_ARIBA()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Virulence & Resistance profile module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    global input_dir
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    global Project
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "profile",
                                            options.debug)

    ###
    print(
        "+ Generate a sample profile for virulence and resistance candidate genes for each sample retrieved using:"
    )
    print(
        "(1) Antimicrobial Resistance Inference By Assembly (ARIBA) software")
    print(
        "(2) Pre-defined databases by different suppliers or user-defined databases."
    )

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ########
    ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:
        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")
        ## TODO: check if it works
        dataBase_user = database_user.update_database_user_data(
            options.database, input_dir, Debug, options)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: results obtained **", 'yellow'))

    else:
        print(
            "+ No update of the database has been requested using option --fast"
        )

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Virulence & Resistance profile module.")
    return ()
Example #16
def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for join reads
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)

    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## mapReads_module is assumed to return the updated time stamp
    ## together with the mapping results used below
    start_time_partial, mapping_results = mapReads_module(options, pd_samples_retrieved,
                                                          mapping_outdir_dict, options.debug,
                                                          max_workers_int, threads_job,
                                                          start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}

        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file
            ## copy pdf
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting biotype module.")
    return ()
Example #17
def run_assembly(options):
    """Main function of the assemble module.
	
	It assembles each sample using SPADES_ and checks quality using BUSCO_ software and database.

	
	.. seealso:: This function depends on other BacterialTyper and HCGB functions called:
	
		- :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`
	
		- :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`
		
		- :func:`BacterialTyper.modules.qc.BUSCO_check`
			
		- :func:`HCGB.sampleParser`
		
		- :func:`HCGB.functions.aesthetics_functions`
		
		- :func:`HCGB.functions.time_functions`
	
		- :func:`HCGB.functions.main_functions`
		
		- :func:`HCGB.functions.file_functions`
		
	.. include:: ../../links.inc	 	
	
	"""

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "assemble",
                                            options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each sample
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), threads_job):
            name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ##
    if (assembly_stats):
        ###################
        if Debug:
            HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
            print(assembly_stats)

        ## create single file
        get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial,
                                 "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
Example #18
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()

    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path,
                      'yellow'))

    ##########
    ## NCBI	##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)
                ## get file information
                print("\t+ Obtaining information from file: %s" %
                      abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded will be included in a KMA index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:

        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        #functions.print_sepLine("*",50, False)

        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/

    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
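
The same default-plus-user selection logic appears twice above (for ARIBA and KMA databases). A minimal sketch of that set-union behaviour; values are illustrative and only_user mirrors the --no_def_* flags:

def merge_db_selection(user_dbs, default_dbs, only_user=False):
    ## only_user: index exclusively the user databases, as with --no_def_kma
    if only_user:
        if not user_dbs:
            raise ValueError("Please select at least one database")
        return set(user_dbs)
    return set(default_dbs) | set(user_dbs or [])

print(merge_db_selection(["typestrains"], ["bacteria", "plasmids"]))
## {'typestrains', 'bacteria', 'plasmids'}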
Ejemplo n.º 19
0
def sketch_database(dict_files, folder, Debug, ksize_n, num_sketch):	
	"""Sketch sequence files
	
	This function generates a sourmash index, also called sketch, of the sequences 
	provided in the folder specified.
	
	For speed reasons, we set force=True in add_sequence step to skip over k-mers containing 
	characters other than ACTG, rather than raising an exception.

	:param dict_files: keys are the names of the files and values are the path to the fasta file
	:param folder:
	:param Debug: True/False to print developing messages.
	:param ksize_n: Kmer size value.
	:param num_sketch: Number of sketches to include in the hash signature. 
	
	:type dict_files: Dictionary
	:type folder: string 
	:type Debug: bool
	:type ksize_n: integer
	:type num_sketch: integer
	
	:returns: List of absolute paths to the signature files generated (siglist_file) and list of SourmashSignature objects (siglist), in that order.
	
	
	.. attention:: The code to implement this API function was taken and adapted from: 
	 
		- https://sourmash.readthedocs.io/en/latest/api-example.html
	
		- https://github.com/dib-lab/sourmash/blob/master/sourmash/commands.py
		
	
	.. seealso:: This function depends on sourmash python module (https://sourmash.readthedocs.io/en/latest/). Some functions employed are:
	
		- :func:`sourmash.MinHash`
		
		- :func:`sourmash.SourmashSignature`
		
		- :func:`sourmash.MinHash.add_sequence`
		
		
	.. include:: ../../links.inc	 
	
	"""
	### Default: set as option
	## num_sketch=5000
	## ksize_n=31
	
	minhashes = {}
	for name,g in dict_files.items():
		print ('\t+ Sketching sample: ', name)
		E = sourmash.MinHash(n=num_sketch, ksize=ksize_n)	## generate hash according to number of sketches and kmer size
		for record in screed.open(g):
			E.add_sequence(record.sequence, True)
		## in add_sequence and for speed reasons, we set force=True to skip over k-mers containing characters other than ACTG, rather than raising an exception.
		minhashes[name]= E
		
	## Debug messages
	if Debug:
		print (colored("\n*** DEBUG: minhashes *****\n", 'red'))
		print (type(minhashes))	
		print (minhashes)

	siglist = []
	siglist_file = []

	### save as signature
	HCGB_files.create_folder(folder)
	for names,hashes in minhashes.items():
		sig1 = SourmashSignature(hashes, name=names)
		outfile_name = folder + '/' + str(names) + '.sig'
		with open(outfile_name, 'wt') as fp:
			save_signatures([sig1], fp)

		siglist_file.append(outfile_name)
		siglist.append(sig1)
	
	return(siglist_file, siglist)		
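
A hypothetical usage of sketch_database(); the sample paths are illustrative and the fasta files must exist on disk. The call assumes the function and its dependencies (sourmash, screed, SourmashSignature, save_signatures) are in scope as above.

samples = {'sample1': '/data/sample1.fna', 'sample2': '/data/sample2.fna'}
sig_files, sigs = sketch_database(samples, '/data/sigs', False, 31, 5000)

## signatures can then be compared directly, e.g. Jaccard similarity:
print(sigs[0].similarity(sigs[1]))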
Ejemplo n.º 20
0
def run(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list(
            "available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(
                        r".*trim_R\d{1}.*",
                        files)  ## only paired-end. TODO: single-end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report",
                                   "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

        ## create fastqc for trimmed reads
        pd_samples_retrieved_trimmed = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
                  start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    return ()
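
A quick check of the file-name pattern used when collecting trimmed reads for the symbolic links above; the file names are made up for illustration:

import re

for f in ("sampleA_trim_R1.fastq.gz", "sampleA_trim_R2.fastq.gz",
          "sampleA_raw_R1.fastq.gz", "trimmomatic.log"):
    if re.search(r".*trim_R\d{1}.*", f):
        print("link:", f)  ## only the *_trim_R1 / *_trim_R2 files match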
Ejemplo n.º 21
0
def run_report(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################

    if (options.help_spaTyper):
        ## help_format option
        get_spa_typing.help_spaTyper()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## set default
    options.batch = False

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Report generation module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init partial time stamp
    start_time_partial = start_time_total

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    global input_dir
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    global Project
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ##
    print("\n+ Get project information:")

    ## get files: trimm, assembly, annotation
    pd_samples_retrieved = database_user.get_userData_files(options, input_dir)
    pd_samples_retrieved['new_name'] = pd_samples_retrieved['name']

    ## get info: profile, ident, cluster, MGE
    pd_samples_info = database_user.get_userData_info(options, input_dir)

    ## get databases to list
    #retrieve_databases = get_options_db(options)

    ## create output files
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "report",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

        print(colored("**DEBUG: pd_samples_info **", 'yellow'))
        print(pd_samples_info)

    ## generate output folder, if necessary
    print(
        "\n\n\n+ Generate a report summarizing analysis and sample information"
    )
    if not options.project:
        HCGB_files.create_folder(outdir)
        outdir_report = outdir
    else:
        ### report generation
        outdir_report = HCGB_files.create_subfolder("report", outdir)

    ## create report with all data
    summary_report = HCGB_files.create_subfolder("summary_report",
                                                 outdir_report)
    print("Folder: ", summary_report)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create species specific report if any
    ########################################
    if (options.species_report):
        ## Saureus
        if options.species_report == "Saureus":
            Saureus_specific(pd_samples_retrieved, pd_samples_info, options,
                             summary_report, outdir_dict)

        ## else
        ## to add accordingly

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########################################################
    ## create gene fasta sequences retrieval if desired
    ###########################################################
    if options.genes_ids_fasta:
        ## given a list of gene ids, retrieve the sequence for all samples from the profile
        ## init the results dataframe here so it exists even if the file check below fails
        results_geneIDs = pd.DataFrame(columns=('sample', 'gene', 'id',
                                                'sequence'))
        if os.path.isfile(os.path.abspath(options.genes_ids_fasta)):
            in_file = os.path.abspath(options.genes_ids_fasta)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            print(
                '+ Retrieve selected gene sequences from the profile analysis for each sample.'
            )
            print('+ Searching gene:')

            ## get profiles available
            sample_frame = pd_samples_info.groupby(["name"])
            for g in gene_names:
                print("\t+", g)
                for name, cluster_df in sample_frame:
                    my_list_profiles = cluster_df.loc[
                        cluster_df['tag'] == 'profile']['ext'].to_list()
                    if options.debug:
                        print("name: ", name)
                        print("my_list_profiles:")
                        print(my_list_profiles)

                    for p in my_list_profiles:
                        main_profile_folder = cluster_df.loc[
                            cluster_df['ext'] == p]['dirname'].to_list()[0]
                        p = p.lower()
                        if p == 'vfdb':
                            p = p + '_full'

                        profile_folder = os.path.join(main_profile_folder, p)
                        (seq_id, seq_sequence
                         ) = retrieve_genes.retrieve_genes_ids_sequences(
                             profile_folder, g, Debug)
                        if (seq_id):
                            ## save results
                            results_geneIDs.loc[len(results_geneIDs)] = (
                                name, g, seq_id, seq_sequence)

        ## save for each gene in a separate fasta file
        list_of_genes = set(results_geneIDs['gene'].to_list())

        ## debug
        if Debug:
            print("** DEBUG **")
            print(results_geneIDs)
            print(list_of_genes)

        ## Save results
        genes_folder = HCGB_files.create_subfolder('genes', summary_report)
        for gene_retrieved in list_of_genes:
            this_frame = results_geneIDs[results_geneIDs['gene'] ==
                                         gene_retrieved]

            gene_retrieved_file = os.path.join(genes_folder, gene_retrieved)
            gene_retrieved_fasta = gene_retrieved_file + ".fasta"
            gene_retrieved_info = gene_retrieved_file + "_info.txt"
            fasta_hd = open(gene_retrieved_fasta, 'w')
            info_hd = open(gene_retrieved_info, 'w')

            for item, row in this_frame.iterrows():
                string2write = ">" + row['sample'] + '_' + row[
                    'gene'] + '\n' + row['sequence'] + '\n'
                string2write_info = row['sample'] + '\t' + row[
                    'gene'] + '\t' + row['id'] + '\n'
                fasta_hd.write(string2write)
                info_hd.write(string2write_info)

            fasta_hd.close()
            info_hd.close()

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ########################################
        ## create gene promoter fasta sequences retrieval if desired
        ########################################
        if options.promoter_bp:
            ## retrieve as many bp as necessary from genes_ids_fasta
            print("** THIS OPTION IS NOT IMPLEMENTED YET... **")
            #get_promoter.get_promoter(file, geneOfInterest, basePairs, sampleName, option, debug=False):

    ########################################
    ## create gene specific report if any
    ########################################
    if options.genes_ids_profile:
        if options.species_report == "Saureus":
            if Debug:
                print("** options.genes_ids_profile **")
                print("Analysis already done for Saureus")
        else:
            in_file = os.path.abspath(options.genes_ids_profile)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            results_Profiles = retrieve_genes.get_genes_profile(
                pd_samples_info, gene_names, options.debug, "name")
            if options.debug:
                print("results_Profiles")
                print(results_Profiles)

            ## open excel writer
            name_excel = summary_report + '/gene_ids_profile.xlsx'
            writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
            results_Profiles.to_excel(writer, sheet_name="gene_ids")

            ## close
            writer.save()

            ## time stamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###############################################
    ## Search for any additional fasta sequence
    ###############################################
    if options.genes_fasta:
        ## given a list of fasta sequences search using blast against proteins annotated or genome
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Report generation module.")
    return ()
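
A compact, self-contained sketch of the per-gene fasta/info writing step above, using context managers instead of explicit close() calls; the dataframe content is illustrative:

import os
import pandas as pd

results = pd.DataFrame([("sampleA", "mecA", "id_1", "ATGAAA"),
                        ("sampleB", "mecA", "id_7", "ATGAAG")],
                       columns=("sample", "gene", "id", "sequence"))

genes_folder = "genes_sketch"
os.makedirs(genes_folder, exist_ok=True)
for gene, frame in results.groupby("gene"):
    base = os.path.join(genes_folder, gene)
    with open(base + ".fasta", "w") as fasta_hd, \
         open(base + "_info.txt", "w") as info_hd:
        for _, row in frame.iterrows():
            fasta_hd.write(">%s_%s\n%s\n" % (row["sample"], row["gene"], row["sequence"]))
            info_hd.write("%s\t%s\t%s\n" % (row["sample"], row["gene"], row["id"]))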
Ejemplo n.º 22
0
def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
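
The options printed above correspond to prokka command-line flags. A hedged sketch of the kind of command the annot_caller step would assemble; paths and sample name are illustrative, and the exact flag set used by the pipeline is an assumption:

import shlex

def prokka_cmd_sketch(assembly, outdir, name, kingdom, threads, genera=None):
    cmd = ["prokka", "--outdir", outdir, "--prefix", name,
           "--kingdom", kingdom, "--addgenes", "--addmrna", "--cdsrnaolap",
           "--cpus", str(threads)]
    if genera and genera != "Other":
        ## enable genus-specific BLAST databases
        cmd += ["--usegenus", "--genus", genera]
    cmd.append(assembly)
    return " ".join(shlex.quote(c) for c in cmd)

print(prokka_cmd_sketch("sampleA.fna", "annot/sampleA", "sampleA",
                        "Bacteria", 4, genera="Staphylococcus"))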
Ejemplo n.º 23
0
def NCBI_DB(strains2get, data_folder, Debug):
    """Donwloads given taxa from NCBI if not available and updates database information.
	
	This function checks in the given folder if strain of interest is available. If not it would connect to NCBI using python module ncbi_genome_download and downloads some information.
	
	:param strains2get: dataframe containing genus, species and NCBI assembly columns among others. See example below.
	:param data_folder: Absolute path to database NCBI folder.
	:param Debug: Print messages for debugging purposes if desired. 
	:type strains2get: dataframe
	:type data_folder: string
	:type Debug: bool
	:return: Dataframe of genbank database updated for all available entries.

	Columns for the dataframe :file:`strains2get` consist of:
	
	sample,genus,species,strain,BioSample,genome,Plasmids
 
	See an example in file: :file:`/devel/results/strains2get_NCBI_DB.csv` and shown here:
	
	.. include:: ../../devel/results/strains2get_NCBI_DB.csv
		:literal:
		
	See example of the return dataframe, containing database information updated in file: :file:`/devel/results/genbank_database.csv` here:
	
	.. include:: ../../devel/results/genbank_database.csv
		:literal:
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.files_functions.create_folder`
	
		- :func:`HCGB.functions.main_functions.get_data`
	
		- :func:`BacterialTyper.scripts.database_generator.get_dbs`
	
		- :func:`BacterialTyper.scripts.database_generator.get_database`
		
		- :func:`BacterialTyper.scripts.database_generator.NCBIdownload`
		
		- :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
		
	.. include:: ../../links.inc	 	
	
	"""

    ## set index
    strains2get = strains2get.set_index(
        'NCBI_assembly_ID', drop=False)  ## set new index but keep column
    strains2get.index.names = ['ID']  ## rename index
    strains2get = strains2get.drop_duplicates()

    #########
    if Debug:
        print(colored("DEBUG: NCBI data provided: ", 'yellow'))
        print(strains2get)

    ## get data existing database
    print("+ Create the database in folder: \n", data_folder)
    HCGB_files.create_folder(data_folder)

    ## read database
    db_frame = getdbs('NCBI', data_folder, 'genbank', Debug)
    database_df = get_database(db_frame, Debug)

    #########
    if Debug:
        print(colored("DEBUG: NCBI genbank database retrieved: ", 'yellow'))
        print("db_frame")
        print(db_frame)
        print()

        print("database_df")
        print(database_df)

    ## loop and download
    for index, row in strains2get.iterrows():
        HCGB_aes.print_sepLine("+", 75, False)
        acc_ID = index  #strains2get.loc[index]['NCBI_assembly_ID']
        info = "Genus: " + strains2get.loc[index][
            'genus'] + '\n' + "Species: " + strains2get.loc[index][
                'species'] + '\n' + "Strain: " + strains2get.loc[index][
                    'name'] + '\n' + "ID accession: " + acc_ID + '\n'
        dir_path = data_folder + '/genbank/bacteria/' + acc_ID  ## module ngd requires to download data in bacteria subfolder under genbank folder

        ## check if already exists
        if acc_ID in database_df.index:
            print("\n+ Data is already available in database for: ")
            print(colored(info, 'green'))

        else:
            ## download
            print("\n+ Downloading data for:")
            print(colored(info, 'green'))
            data_accID = NCBIdownload(acc_ID, strains2get, data_folder)
            this_db = HCGB_main.get_data(data_accID, ',', 'index_col=0')
            this_db = this_db.set_index('ID')
            database_df = database_df.append(this_db)

    ## Generate/Update database
    database_csv = data_folder + '/genbank_database.csv'
    db_updated = update_db_data_file(database_df, database_csv)
    print("+ Database has been generated in file: ", database_csv)
    return (db_updated)
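
Hypothetical input for NCBI_DB(): a dataframe with the columns described in the docstring plus the NCBI_assembly_ID used as index; all values below are illustrative, and the call itself is commented out since it requires network access.

import pandas as pd

strains2get = pd.DataFrame(
    [("sample1", "Staphylococcus", "aureus", "USA300", "SAMN00000001",
      "genome.fna", "2", "GCA_000013465.1")],
    columns=("sample", "genus", "species", "name", "BioSample",
             "genome", "Plasmids", "NCBI_assembly_ID"))

## db_updated = NCBI_DB(strains2get, "/path/to/NCBI_db", Debug=False)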
Ejemplo n.º 24
0
def download_kma_database(folder, database, debug):
	"""
	Downloads databases from KMA website.
	
	Using the latest available ftp datasets, this function downloads available datasets using
	function :func:`BacterialTyper.scripts.functions.wget_download`. 
	
	Ftp site: "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"
	
	It also downloads the md5sum for the dataset selected and compares it with the md5sum of the downloaded file.
	
	:param folder: Absolute path to folder that contains database.
	:param database: Possible options: [bacteria, archaea, protozoa, fungi, plasmids, typestrains, viral].
	:param debug: True/false for printing debugging messages.
	
	:type folder: string
	:type database: string
	:type debug: boolean
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.wget_download`

		- :func:`BacterialTyper.scripts.functions.check_md5sum`

		- :func:`BacterialTyper.scripts.functions.extract`
		
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`

		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

	"""

	## ToDo: update with latest version
	ftp_site = "http://www.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/latest/"
	
	## In v20190107 there was a plasmid database.
	#ftp_site = "ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/20190107/"

	############################################################################
	## ToDo: Set automatic: download config file and look for prefix for each 
	## sample and generate a dictionary to code the prefix for each db.
	############################################################################
	
	# Database configuration file - Describes the content of the database
	# Each db consists of 5 files with the following extensions: b, comp.b, length.b, seq.b, name
	# Other important files are: .name, .kma.entries.all, .kma.entries.deleted, .kma.entries.added, .md5
	# db_prefix	name	description
	#bacteria.ATG	Bacteria Organisms	Bacteria organisms library prefix=ATG
	#plasmids.T	Bacteria Plasmids	Bacteria plasmids library prefix=T
	#typestrains.ATG	Bacteria Type Strains	Bacteria type strains library prefix=ATG
	#fungi.ATG	Fungi	Fungi library prefix=ATG
	#protozoa.ATG	Protozoa	Protozoa library prefix=ATG
	#archaea.ATG	Archaea	Archaea library prefix=ATG	
	
	HCGB_files.create_folder(folder)
	
	## debug message
	if (debug):
		print (colored("Function call: download_kma_database " + folder + ' ' + database + '\n','yellow'))

	## prefix
	if (database == 'plasmids'):
		prefix = '.T'
	elif (database == 'viral'):
		prefix = '.TG'
	else:
		prefix = '.ATG'
		
	index_name = os.path.join(folder, database + prefix)

	## check if already download
	return_code_down = False
	if os.path.exists(folder):
		return_code_down = check_db_indexed(index_name, folder)
		## debug message
		if (debug):
			print (colored("Folder database is already available:" + folder,'yellow'))
		
	if (return_code_down == False): ## database not downloaded or indexed yet

		## Download data
		print ("\t+ Downloading data now, it may take a while....")

		## debug message
		if (debug):
			print (colored("Download files via function wget_download:",'yellow'))
		
		## connect to url
		url = ftp_site + database + '.tar.gz'
		HCGB_sys.wget_download(url, folder)

		md5_url = ftp_site + database + '.md5'
		HCGB_sys.wget_download(md5_url, folder)
		print ("\n\t+ Data downloaded.....")

		## get files
		files = os.listdir(folder)
		md5_sum = ""
		for f in files:
			if f.endswith('tar.gz'):
				tar_file = folder + '/' + f
			elif f.endswith('md5'):
				md5_sum = folder + '/' + f
		
		## check md5sum
		print ("\t+ Checking for integrity using md5sum")
		
		# get md5 sum from source
		md5_string = ""
		with open(md5_sum, 'r') as myfile:
			line = myfile.read()
		
		line = re.sub(r"\s", ',', line)
		md5_string = line.split(",")[0]
		
		## calculate md5 for file
		result_md5 = HCGB_sys.check_md5sum(md5_string, tar_file) ## FIXME: Not conda supported
		if (result_md5 == True):
		
			## debug message
			if (debug):
				print (colored("result md5sum matches code provided for file " + tar_file,'yellow'))

			# extract
			print ("\t+ Extracting database into destination folder: " + folder)
			HCGB_files.extract(tar_file, folder)	

		else:
			print (colored("*** ERROR: Some error occurred during the downloading and file is corrupted ***", 'red'))
			return ("Error")
			
		## database should be unzipped and containing files...
		return_code_extract = check_db_indexed(index_name, folder)
		
		if (return_code_extract):
			print("+ Database (%s) successfully extracted in folder: %s..." %(database, folder))
		else:
			string = "*** ERROR: Some error occurred during the extraction of the database (%s). Please check folder (%s) and downloading and file is corrupted ***" %(database, folder)
			print (colored(string, 'red'))
			return ("Error")
		
		## print timestamp
		filename_stamp = folder + '/.success'
		stamp =	HCGB_time.print_time_stamp(filename_stamp)
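
A minimal standard-library sketch of the md5 integrity check performed above; HCGB_sys.check_md5sum is assumed to do roughly this comparison:

import hashlib
import re

def md5_matches(md5_file, tar_file):
    with open(md5_file) as fh:
        ## first whitespace-separated field of the .md5 file is the checksum
        expected = re.sub(r"\s", ",", fh.read()).split(",")[0]
    h = hashlib.md5()
    with open(tar_file, "rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            h.update(chunk)
    return h.hexdigest() == expected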
Ejemplo n.º 25
0
def mapReads(option, reads, folder, name, STAR_exe, genomeDir, limitRAM_option,
             num_threads, Debug):
    """
    Map reads using STAR software. Some parameters are set for small RNA Seq.

    Parameters set according to ENCODE Project directives for small RNAs
    https://www.encodeproject.org/rna-seq/small-rnas/
    
    :param option: Use 'LoadAndKeep' to share a loaded genome when mapping multiple samples; any other value maps without shared memory.
    :param reads: List containing absolute path to reads (SE or PE)
    :param folder: Path for output results
    :param name: Sample name
    :param STAR_exe: Executable path for STAR binary
    :param genomeDir: Path to the STAR genome index directory.
    :param limitRAM_option: maximum available RAM (bytes) for map reads process. Default: 40000000000
    :param num_threads: Number of threads for the mapping process.
    
    :type option: string
    :type reads: list
    :type folder: string 
    :type name: string 
    :type STAR_exe: string
    :type genomeDir: string 
    :type limitRAM_option: int
    :type num_threads: int
    
    
    """
    ## open file
    print("\t+ Mapping sample %s using STAR" % name)

    if not os.path.isdir(folder):
        folder = files_functions.create_folder(folder)
    ##
    bam_file_name = os.path.join(folder, 'Aligned.sortedByCoord.out.bam')

    ## read is a list with 1 or 2 read fastq files
    jread = " ".join(reads)

    ## prepare command
    cmd = "%s --genomeDir %s --runThreadN %s " % (STAR_exe, genomeDir,
                                                  num_threads)
    cmd = cmd + "--limitBAMsortRAM %s --outFileNamePrefix %s " % (
        limitRAM_option, folder + '/')

    ## some common options
    cmd = cmd + "--alignSJDBoverhangMin 1000 --outFilterMultimapNmax 1 --outFilterMismatchNoverLmax 0.03 "
    cmd = cmd + "--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMatchNmin 16 "
    cmd = cmd + "--alignIntronMax 1 --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMtype BAM SortedByCoordinate "

    ## Multiple samples or just one?
    if option == 'LoadAndKeep':
        cmd = cmd + "--genomeLoad LoadAndKeep"
    else:
        cmd = cmd + "--genomeLoad NoSharedMemory"

    ## ReadFiles
    cmd = cmd + " --readFilesIn %s " % jread

    ## logfile & errfile
    logfile = os.path.join(folder, 'STAR.log')
    errfile = os.path.join(folder, 'STAR.err')
    cmd = cmd + ' > ' + logfile + ' 2> ' + errfile

    ## sent command
    mapping_code = system_call_functions.system_call(cmd, False, True)

    return (mapping_code)
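
A hypothetical call to mapReads(); all paths are illustrative and nothing is executed here. With option='LoadAndKeep' the STAR index stays loaded in shared memory across samples:

return_code = mapReads(option='LoadAndKeep',
                       reads=['/data/sampleA_R1.fastq.gz', '/data/sampleA_R2.fastq.gz'],
                       folder='/results/sampleA_map', name='sampleA',
                       STAR_exe='/usr/local/bin/STAR', genomeDir='/db/STAR_index',
                       limitRAM_option=40000000000, num_threads=4, Debug=False)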
Ejemplo n.º 26
0
def parse_options(arg_dict):

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as a mutually_exclusive group. It might be extended to allow multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy,
                 organism) = BacDup.scripts.functions.get_gbk_information(
                     file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(
                            arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder(
                "db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(taxid) for taxid in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get,
         allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
             db_folder,
             string_info_total,
             int(arg_dict.k_random),
             arg_dict.debug,
             assembly_level_given=arg_dict.assembly_level,
             group_given=group_obtained,
             section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'all_entries.txt'),
            allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nIf a random subset was selected, note that re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## create df_accID to store data
        ## TODO
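        ## NOTE: df_accID must be populated here, otherwise the set_index call below fails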

    ## Returns dataframe with information

    df_accID = df_accID.set_index('new_name')
    return (df_accID)
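
A minimal sketch of the taxonomy lookups the wrappers above appear to perform, written directly against ete3's NCBITaxa (an assumption about the backend, which this snippet does not show; taxID 1279, Staphylococcus, is illustrative):

from ete3 import NCBITaxa

ncbi = NCBITaxa()  ## loads (and on first use downloads) the local NCBI taxdump

## "unravel" a taxon into its descendant taxIDs
descendants = ncbi.get_descendant_taxa(1279, collapse_subspecies=True)

## superkingdom of the first descendant, via its lineage and ranks
lineage = ncbi.get_lineage(descendants[0])
ranks = ncbi.get_rank(lineage)
superkingdom = [tid for tid, rank in ranks.items() if rank == 'superkingdom']
print(superkingdom)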
Example #27
0
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial):
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status	##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(
                        options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(
                colored("**DEBUG: card_trick_info: " + card_trick_info + " **",
                        'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db',
                                           'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name],
                                                       db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0],
                    db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job,
                    options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }
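            ## commandsSent maps each Future back to its sample name so that
            ## any exception below can be attributed to the right sample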

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print(
                "+ Collecting information for each sample analyzed for database: "
                + db2use[1])
            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print(
        "\n + Generate a summary file for all samples and one for each database employed..."
    )

    ## parse results
    if Project:
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir) ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## databases other than VFDB and CARD would all collapse into this single summary file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if csv_all != 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:

            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                ## rename in place (rename() returns None when inplace=True)
                df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()
        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print(
        "+ For each database, upload the *phandango.csv and *phandango.tre files and visualize the results"
    )
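
The submit/as_completed pattern used above is standard concurrent.futures; a stripped-down, self-contained sketch of the same idea (run_one is a hypothetical stand-in for ariba_run_caller):

import concurrent.futures

def run_one(sample, db):
    ## stand-in for the real job, e.g. ariba_run_caller
    return "%s screened against %s" % (sample, db)

samples = ['sample1', 'sample2', 'sample3']
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(run_one, s, 'card'): s for s in samples}
    for fut in concurrent.futures.as_completed(futures):
        name = futures[fut]
        try:
            print(fut.result())
        except Exception as exc:
            print('%r generated an exception: %s' % (name, exc))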
Example #28
0
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.
    
    This module prepares data for later gene duplication analysis. 
    
    It allows the user to provide either a single sample, multiple samples,
    NCBI GenBank IDs or NCBI taxonomy IDs from which to retrieve the annotation data.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir
    else:
        arg_dict.project = True
        print(
            "+ Generating an information directory within the provided project folder"
        )
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files + reference FASTA files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')

    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)

    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
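
For reference, a minimal stand-in for the dataframe parse_options is expected to return (column names taken from the comment in run_input above; all values are illustrative, not real data):

import pandas as pd

df_accID = pd.DataFrame(columns=('new_name', 'folder', 'genus', 'species',
                                 'taxonomy', 'genome', 'annot_file',
                                 'format_annot_file', 'proteins',
                                 'plasmids_number', 'plasmids_ID'))
df_accID.loc[0] = ('sample1', '/path/to/sample1', 'Staphylococcus', 'aureus',
                   'Bacteria', 'GCA_000013425.1', 'sample1.gbk', 'genbank',
                   'sample1.faa', 2, 'plasmid_A;plasmid_B')
df_accID = df_accID.set_index('new_name')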
Example #29
0
def run_cluster(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "fastq",
                ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        else:
            ## trimm reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "trim", ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]

    else:
        ## default
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    # exit if empty
    if pd_samples_retrieved.empty:
        print(
            "No data has been retrieved from the project folder provided. Exiting now..."
        )
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "mash",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[~retrieve_databases.index.
                                                    isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            if all([
                    int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])
            ]):
                siglist_all.append(
                    min_hash_caller.read_signature(row['path'],
                                                   options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        ## use .loc[row, col] to avoid chained-indexing assignment, which may not modify the dataframe
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)
    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(
                colored(
                    '\t+ Sketched signature (%s) available within user data...'
                    % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## a signature file may already exist; reuse it only if its
            ## parameters match the current run

            ## read the .original metadata file (source file, ksize, num_sketch)
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)
                if all([
                        int(options.kmer_size) == int(original[1]),
                        int(options.n_sketch) == int(original[2])
                ]):
                    siglist_all.append(
                        min_hash_caller.read_signature(this_sig,
                                                       options.kmer_size))
                    pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                        'project_data', index, this_sig, row['sample'],
                        options.kmer_size, options.n_sketch)
                    print(
                        colored(
                            '\t+ Sketched signature available (%s) in project folder...'
                            % index, 'green'))
                    continue

        print(
            colored('\t+ Sketched signature to be generated: (%s)...' % index,
                    'yellow'))
        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = ('project_data',
                                                             index, sigfile,
                                                             row['sample'],
                                                             options.kmer_size,
                                                             options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[[
            'source', 'db', 'path', 'original', 'ksize', 'num_sketch'
        ]]
        tmp = tmp.rename(columns={'db': 'name'})
        tmp = tmp.set_index('name')  ## set_index returns a new dataframe

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp],
                               join='inner',
                               sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)

        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)

        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)

        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign Colors colorLabels
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data),
                 "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data),
                 "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        #final_dir = outdir + '/report/cluster'
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all,
                                                      tag_cluster_info, Debug)

    ## get colorLabels

    ## plot images
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext,
                                            tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext,
                                    tag_cluster_info)

    return ()
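
generate_sketch and min_hash_caller presumably build on sourmash (an assumption; the wrappers are not shown here). Sketching and comparing two sequences directly with sourmash looks roughly like this, with n mirroring options.n_sketch and ksize mirroring options.kmer_size:

import sourmash

mh1 = sourmash.MinHash(n=500, ksize=31)
mh2 = sourmash.MinHash(n=500, ksize=31)
mh1.add_sequence('ATGCGTACGTAGCTAGCTAGCTGATCGATCGTAGCTAGCTAAC')
mh2.add_sequence('ATGCGTACGTAGCTAGCTAGCTGATCGATCGTAGCTAGCTAAG')
print(mh1.similarity(mh2))  ## Jaccard similarity in [0, 1]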
Example #30
0
def run_ident(options):
    """
	Main function acting as an entry point to the module *ident*.
	
	Arguments:
	
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	"""

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_KMA):
        ## information for KMA Software
        species_identification_KMA.help_kma_database()
        exit()

    elif (options.help_MLSTar):
        ## information for MLSTar software
        MLSTar.help_MLSTar()
        exit()

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ### species_identification_KMA -> most similar taxa
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Species identification")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    global Project

    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "ident",
                                            options.debug)

    ## let's start the process
    print(
        "+ Generate a species typification for each sample retrieved using:")
    print("(1) Kmer alignment (KMA) software.")
    print("(2) Pre-defined databases by KMA or user-defined databases.")

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ######## KMA identification
    dataFrame_kma = KMA_ident(options, pd_samples_retrieved, outdir_dict,
                              retrieve_databases, start_time_partial)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve results to summarize **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print("dataframe_kma")
        print(dataFrame_kma)

    ## exit if viral search
    skip = False
    if (len(options.kma_dbs) == 1):
        for i in options.kma_dbs:
            if (i == 'viral'):
                print()
                MLST_results = ''
                options.fast = True
                skip = True

            ## what if only plasmids?

    ## do edirect and MLST if bacteria
    if (not skip):
        dataFrame_edirect = pd.DataFrame()

        ######## EDirect identification
        #dataFrame_edirect = edirect_ident(dataFrame_kma, outdir_dict, Debug)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: retrieve results from NCBI **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("dataFrame_edirect")
            print(dataFrame_edirect)

        ######## MLST identification
        MLST_results = MLST_ident(options, dataFrame_kma, outdir_dict,
                                  dataFrame_edirect, retrieve_databases)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(
                colored("**DEBUG: retrieve results to summarize **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("MLST_results")
            print(MLST_results)

    ## generate summary for sample: all databases
    ## MLST, plasmids, genome, etc
    HCGB_aes.boxymcboxface("Results Summary")

    #####################################
    ## Summary identification results  ##
    #####################################

    ## parse results
    if options.project:
        final_dir = os.path.join(outdir, 'report', 'ident')
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = outdir

    ###
    excel_folder = HCGB_files.create_subfolder("samples", final_dir)
    print('+ Print summary results in folder: ', final_dir)
    print('+ Print sample results in folder: ', excel_folder)

    # Group dataframe results summary by sample name
    sample_results_summary = dataFrame_kma.groupby(["Sample"])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: sample_results_summary **", 'yellow'))
        print(sample_results_summary)

    ##
    results_summary_KMA = pd.DataFrame()
    MLST_all = pd.DataFrame()
    for name, grouped in sample_results_summary:

        ## create a excel and txt for sample
        name_sample_excel = excel_folder + '/' + name + '_ident.xlsx'
        name_sample_csv = outdir_dict[
            name] + '/ident_summary.csv'  ## check in detached mode

        writer_sample = pd.ExcelWriter(
            name_sample_excel, engine='xlsxwriter')  ## open excel handle

        ## subset dataframe	& print result
        results_summary_toPrint_sample = grouped[[
            'Sample', '#Template', 'Query_Coverage', 'Template_Coverage',
            'Depth', 'Database'
        ]]
        results_summary_toPrint_sample.to_excel(
            writer_sample, sheet_name="KMA")  ## write excel handle
        results_summary_toPrint_sample.to_csv(
            name_sample_csv)  ## write csv for sample

        ## read MLST
        if MLST_results:
            if name in MLST_results:
                sample_MLST = pd.read_csv(MLST_results[name],
                                          header=0,
                                          sep=',')
                sample_MLST['genus'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'genus'].values[0]
                sample_MLST['species'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'species'].values[0]
                sample_MLST.to_excel(writer_sample,
                                     sheet_name="MLST")  ## write excel handle

                ## Return information to excel
                MLST_all = pd.concat([MLST_all, sample_MLST])

        ## close excel handle
        writer_sample.save()

    ##
    name_excel = final_dir + '/identification_summary.xlsx'
    print('+ Summary information in excel file: ', name_excel)
    writer = pd.ExcelWriter(name_excel,
                            engine='xlsxwriter')  ## open excel handle

    ## KMA dataframe: print result for sources
    results_summary_KMA = dataFrame_kma[[
        'Sample', '#Template', 'Query_Coverage', 'Template_Coverage', 'Depth',
        'Database'
    ]]

    ## Sum plasmid and chromosome statistics ##
    ## sum coverage
    total_coverage = results_summary_KMA.groupby(
        'Sample')['Query_Coverage'].sum().reset_index()

    ## debug message
    if (Debug):
        print("*** Sum: Query_coverage ***")
        print(total_coverage)

    ## TODO: FIX SUMMARY REPORT
    results_summary_KMA = results_summary_KMA.set_index('Sample')
    results_summary_KMA = results_summary_KMA.sort_values(
        by=['Sample', 'Database', 'Query_Coverage'],
        ascending=[True, True, True])
    results_summary_KMA.to_excel(writer,
                                 sheet_name='KMA')  ## write excel handle

    ## write MLST
    if (MLST_results):
        MLST_all.to_excel(writer, sheet_name='MLST')

    ## write excel and close
    writer.save()  ## close excel handle

    print("\n+ Check summary of results in file generated")

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:

        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")

        ## debug message
        if (Debug):
            print(colored("**DEBUG: dataFrame_edirect **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print(dataFrame_edirect)

        ## dataFrame_edirect
        file_toprint = final_dir + '/edirect_info2download.csv'
        dataFrame_edirect.to_csv(file_toprint)

        ## update database with samples identified
        data2download = dataFrame_edirect.filter(
            ['genus', 'species', 'strain', 'genome'])
        data2download = data2download.rename(columns={
            'genome': 'NCBI_assembly_ID',
            'strain': 'name'
        })
        NCBI_folder = os.path.abspath(options.database) + '/NCBI'
        database_generator.NCBI_DB(data2download, NCBI_folder, Debug)

    else:
        print(
            "+ Database update skipped, as option --fast was set"
        )

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting identification module.")
    return ()
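
The per-sample coverage sum above is plain pandas; a tiny self-contained illustration with made-up numbers:

import pandas as pd

kma = pd.DataFrame({'Sample': ['sample1', 'sample1', 'sample2'],
                    'Query_Coverage': [61.2, 35.5, 98.4]})
## chromosome and plasmid hits of the same sample are added together
total_coverage = kma.groupby('Sample')['Query_Coverage'].sum().reset_index()
print(total_coverage)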