Example #1
def get_outfile(output_dir, name, index_name):
    ## build the output file name for a given sample and ARIBA index

    basename_tag = index_name.split("_prepareref/")[0]
    basename = os.path.basename(basename_tag)

    if Project:
        out_file = output_dir + '/' + basename
    else:
        output_path = HCGB_files.create_subfolder(name, output_dir)
        out_file = output_path + '/' + name + '_' + basename

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: Input names " + name + '\n' + output_dir + '\n' +
                index_name + "\n", 'yellow'))
        print(
            colored(
                "**DEBUG: Output names \n" + basename + '\n' + basename_tag +
                '\n' + out_file + " **\n", 'yellow'))

    return (out_file)
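This variant assumes index_name follows the '<db>_prepareref' folder convention created by download_ariba_databases further below. A minimal standalone sketch of the name handling, with a hypothetical path:

import os

index_name = '/db/ARIBA/card_prepareref/00.info'     ## hypothetical ARIBA index path
basename_tag = index_name.split("_prepareref/")[0]   ## -> '/db/ARIBA/card'
basename = os.path.basename(basename_tag)            ## -> 'card'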
def BUSCO_plots(dataFrame_results, outdir, threads):

	## DataFrame columns ('sample', 'dirname', 'name', 'ext', 'tag', 'busco_folder', 'busco_dataset', 'busco_summary', 'busco_results')
	list_datasets = set(dataFrame_results['busco_dataset'].tolist())
	list_samples = set(dataFrame_results['name'].tolist())

	plot_folder = HCGB_files.create_subfolder('BUSCO_plots', outdir)
	outdir_busco_plot = []
	
	## summary for dataset
	print ("+ Get results for all samples summarized by dataset:")
	for dataset in list_datasets:
		print ("\t+ Get results for: ", dataset)
		plot_folder_dataset = HCGB_files.create_subfolder(dataset, plot_folder)
		outdir_busco_plot.append(plot_folder_dataset)
	
		for index, row in dataFrame_results.iterrows():
			if (dataset == row['busco_dataset']):
				shutil.copy(row['busco_summary'], 
						plot_folder_dataset + '/short_summary.specific.' + dataset + '.' + row['name'] + '.txt')
		
	print ("+ Get results for summarized by sample:")
	for sample in list_samples:
		print ("\t+ Get results for: ", sample)
		plot_folder_sample = HCGB_files.create_subfolder(sample, plot_folder)
		outdir_busco_plot.append(plot_folder_sample)

		for index, row in dataFrame_results.iterrows():
			if (sample == row['name']):
				shutil.copy(row['busco_summary'], 
						plot_folder_sample + '/short_summary.specific.' + row['busco_dataset'] + '.' + row['name'] + '.txt')
	
	print ("+ Generate plots for each subset")
	## remember the current working directory: BUSCO_plot may change it
	path_here = os.getcwd()
	
	for plot in outdir_busco_plot:
		BUSCO_plot(plot) 
			
	print ("+ All plots generated...")
	print ("+ Check results under folders in : ", plot_folder)
	
	os.chdir(path_here)
	return()
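A sketch of the expected input, assuming the module's imports and a pandas DataFrame carrying at least the three columns this function actually reads ('name', 'busco_dataset', 'busco_summary'); paths are hypothetical:

import pandas as pd

df = pd.DataFrame([
    ## one row per sample/dataset combination
    {'name': 'sample1', 'busco_dataset': 'bacteria_odb10',
     'busco_summary': '/data/sample1/BUSCO/short_summary.txt'},
    {'name': 'sample2', 'busco_dataset': 'bacteria_odb10',
     'busco_summary': '/data/sample2/BUSCO/short_summary.txt'},
])
BUSCO_plots(df, '/data/report', 2)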
Example #3
def load_Genome(folder, STAR_exe, genomeDir, num_threads):

    ## --genomeLoad LoadAndExit
    Load_folder = files_functions.create_subfolder('LoadMem', folder)
    cmd_LD = "%s --genomeDir %s --runThreadN %s --outFileNamePrefix %s --genomeLoad LoadAndExit" % (
        STAR_exe, genomeDir, num_threads, Load_folder)

    print('\t+ Loading memory for STAR mapping')
    load_code = system_call_functions.system_call(cmd_LD, False, True)
    return (load_code)
Example #4
def remove_Genome(STAR_exe, genomeDir, folder, num_threads):

    ## --genomeLoad Remove
    remove_folder = files_functions.create_subfolder('RemoveMem', folder)
    cmd_RM = "%s --genomeDir %s --outFileNamePrefix %s --runThreadN %s --genomeLoad Remove" % (
        STAR_exe, genomeDir, remove_folder, num_threads)

    ## send command
    print('\t+ Removing memory loaded for STAR mapping')
    remove_code = system_call_functions.system_call(cmd_RM, False, True)
    return (remove_code)
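load_Genome and remove_Genome are meant to bracket a mapping loop, so the index is loaded into shared memory once and evicted when all samples are done (see mapReads_module below). A minimal lifecycle sketch, with hypothetical paths:

STAR_exe = '/usr/local/bin/STAR'      ## hypothetical executable path
genomeDir = '/data/STAR_index'        ## hypothetical index folder
work_folder = '/data/STAR_files'

load_Genome(work_folder, STAR_exe, genomeDir, 4)        ## --genomeLoad LoadAndExit
try:
    for sample in ['sampleA', 'sampleB']:
        ## each mapping job would reuse the shared index via --genomeLoad LoadAndKeep
        pass
finally:
    remove_Genome(STAR_exe, genomeDir, work_folder, 4)  ## --genomeLoad Remove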
Example #5
def create_genomeDir(folder, STAR_exe, num_threads, fasta_file,
                     limitGenomeGenerateRAM):

    ## create the genomeDir folder holding the STAR index
    genomeDir = files_functions.create_subfolder("STAR_index", folder)

    cmd_create = "%s --runMode genomeGenerate --limitGenomeGenerateRAM %s --runThreadN %s --genomeDir %s --genomeFastaFiles %s" % (
        STAR_exe, limitGenomeGenerateRAM, num_threads, genomeDir, fasta_file)

    print('\t+ genomeDir generation for STAR mapping')
    create_code = system_call_functions.system_call(cmd_create, False, True)

    if not create_code:
        print("** ERROR: Some error ocurred during genomeDir creation... **")
        exit()

    return (genomeDir)
Example #6
def prepare_card_data(database_folder):
	
	## create CARD folder
	abs_folder = os.path.abspath(database_folder)
	CARD_folder = HCGB_files.create_subfolder('CARD', abs_folder)
	
	## make stamp time
	filename_stamp = CARD_folder + '/.success'

	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tA previous command generated results on: %s [CARD Ontology Data]" %stamp, 'yellow'))

		## check time passed
		days_passed = HCGB_time.get_diff_time(filename_stamp)
		print ("\t** %s days ago" %days_passed)		
		if (days_passed > 30): ## download again
			print ("\t ** Downloading information again just to be sure...")
			download=True
		else:
			print ("\t ** No need to download data again.")
			download=False
	else:
		download=True

	###
	if download:
		## update the database in the given path
		aro_obo_file = card_trick.ontology_functions.update_ontology(CARD_folder, False)
	
		## get ontology and save it in csv
		return_frame = card_trick.ontology_functions.parse_ontology(aro_obo_file, False)
	
		### if success return folder name
		if not return_frame.empty:
			## success stamps
			filename_stamp = CARD_folder + '/.success'
			stamp =	HCGB_time.print_time_stamp(filename_stamp)	
		else:
			return ('FAIL')

	## return folder name
	return(CARD_folder)
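The stamp-file pattern above (write a hidden .success file and re-download only when it is older than 30 days) recurs in several functions below. A minimal sketch of the check, assuming the module's HCGB_time import:

import os

def needs_refresh(stamp_file, max_days=30):
    ## re-run when no stamp exists or the previous run is older than max_days
    if not os.path.isfile(stamp_file):
        return True
    return HCGB_time.get_diff_time(stamp_file) > max_days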
Example #7
def snippy_variant_caller(reference, files, threads, outdir, name, contig_option, other_options, sample_name, Debug):
    
    ## create subfolder within phylo for this mapping
    tag = sample_name + '_vs_' + name
    subdir = HCGB_files.create_subfolder(tag, outdir)
       
    ## check if previously process and succeeded
    filename_stamp = subdir + '/.success'
    
    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print (colored("\tA previous command generated results on: %s [%s]" %(stamp, tag), 'yellow'))
        code = 'OK'
    else:
        ## call variant calling
        code = variant_calling.snippy_call(reference, files, threads, subdir, 
                                           sample_name, contig_option, other_options, Debug)
        if code == 'OK':
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    return(code)
Example #8
def get_outfile(output_dir, name, index_name):
    """
	Generates the name for the output file created
	
	:param output_dir: Absolute path to results folder 
	:param name: Name of the sample
	:param index_name: Name of the database
	
	:type output_dir: string
	:type name: string
	:type index_name: string
	
	:returns: Output file absolute path
	"""
    basename_tag = os.path.basename(index_name)
    if Project:
        output_path = output_dir
    else:
        output_path = HCGB_files.create_subfolder(name, output_dir)

    out_file = output_path + '/' + name + '_' + basename_tag
    return (out_file)
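A hedged usage sketch; paths are illustrative and Project is the module-level flag set by the calling module:

## with Project == False (detached mode) a per-sample subfolder is created:
out_file = get_outfile('/data/results', 'sample1', '/db/ARIBA/card_prepareref')
## -> '/data/results/sample1/sample1_card_prepareref'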
def download_ariba_databases(list_dbs, main_folder, Debug, threads):

	"""Download ARIBA_ databases.
	
	Using ARIBA software this function retrieves the desired databases and prepares them for later analysis.
	
	:param list_dbs: List of databases to download.
	:param main_folder: Absolute path to database folder.
	:param Debug: True/false for printing developer messages
	:param threads: Number of CPUs to use.
	
	:type list_dbs: list
	:type main_folder: string
	:type Debug: Boolean
	:type threads: integer
	
	 .. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.file_functions.create_subfolder`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`
	
		- :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`		
		
	 
	.. include:: ../../links.inc
	"""

	print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
	ariba_folder = HCGB_files.create_subfolder("ARIBA", main_folder)

	## print ARIBA databases: 
	print ("+ Available databases:")
	dbs = get_ARIBA_dbs(list_dbs)
	
	for db_set in dbs:

		HCGB_aes.print_sepLine("-",30, False)
		print (colored("+ " + db_set,'yellow'))
		
		## prepare folders
		folder_set = HCGB_files.create_subfolder(db_set, ariba_folder)
		outdir_prepare_ref = folder_set + '_prepareref'

		## stamp time file
		filename_stamp_prepare = outdir_prepare_ref + '/.success'
	
		## check if previously done
		if os.path.isfile(filename_stamp_prepare):
			stamp =	HCGB_time.read_time_stamp(filename_stamp_prepare)
			print ("\t+ Database is downloaded in folder: ", folder_set)
			print ("\t+ Data is available and indexed in folder: ", outdir_prepare_ref)
			print (colored("\tDatabase was previously downloaded and prepared on: %s" %stamp, 'yellow'))
		
			## Check if necessary to download again after several months/days
			days_passed = HCGB_time.get_diff_time(filename_stamp_prepare)
			print ("\t\t** %s days ago" %days_passed)		
			if (days_passed > 30): ## download again
				print ("\t\t** Downloading information again just to be sure...")
				return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
			else:
				return_ariba_getref = 'OK'
		else:
			return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
		
		if (return_ariba_getref == 'OK'):
			print()
		else:
			print (colored("** ARIBA getref failed or generated a warning for " + db_set, 'red'))
Example #10
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()

    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path,
                      'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)
                ## get file information
                print("\t+ Obtaining information from file: %s" %
                      abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded will be included in a KMA index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:

        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        #functions.print_sepLine("*",50, False)

        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/

    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
def run_report(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################

    if (options.help_spaTyper):
        ## help_format option
        get_spa_typing.help_spaTyper()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    ## set default
    options.batch = False

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Report generation module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## call assemble using spades
    start_time_partial = start_time_total

    ## absolute path for in & out
    options.database = os.path.abspath(options.database)
    global input_dir
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    global Project
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ##
    print("\n+ Get project information:")

    ## get files: trimm, assembly, annotation
    pd_samples_retrieved = database_user.get_userData_files(options, input_dir)
    pd_samples_retrieved['new_name'] = pd_samples_retrieved['name']

    ## get info: profile, ident, cluster, MGE
    pd_samples_info = database_user.get_userData_info(options, input_dir)

    ## get databases to list
    #retrieve_databases = get_options_db(options)

    ## create output files
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "report",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

        print(colored("**DEBUG: pd_samples_info **", 'yellow'))
        print(pd_samples_info)

    ## generate output folder, if necessary
    print(
        "\n\n\n+ Generate a report summarizing analysis and sample information"
    )
    if not options.project:
        HCGB_files.create_folder(outdir)
        outdir_report = outdir
    else:
        ### report generation
        outdir_report = HCGB_files.create_subfolder("report", outdir)

    ## create report with all data
    summary_report = HCGB_files.create_subfolder("summary_report",
                                                 outdir_report)
    print("Folder: ", summary_report)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ########################################
    ## create species specific report if any
    ########################################
    if (options.species_report):
        ## Saureus
        if options.species_report == "Saureus":
            Saureus_specific(pd_samples_retrieved, pd_samples_info, options,
                             summary_report, outdir_dict)

        ## else
        ## to add accordingly

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########################################################
    ## create gene fasta sequences retrieval if desired
    ###########################################################
    if options.genes_ids_fasta:
        ## given a list of genes ids, retrieve sequence for all samples from profile
        if os.path.isfile(os.path.abspath(options.genes_ids_fasta)):
            in_file = os.path.abspath(options.genes_ids_fasta)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            print(
                '+ Retrieve selected genes sequences from the profile analysis for each sample.'
            )
            print('+ Searching gene:')

            ## get profiles available
            results_geneIDs = pd.DataFrame(columns=('sample', 'gene', 'id',
                                                    'sequence'))
            sample_frame = pd_samples_info.groupby(["name"])
            for g in gene_names:
                print("\t+", g)
                for name, cluster_df in sample_frame:
                    my_list_profiles = cluster_df.loc[
                        cluster_df['tag'] == 'profile']['ext'].to_list()
                    if options.debug:
                        print("name: ", name)
                        print("my_list_profiles:")
                        print(my_list_profiles)

                    for p in my_list_profiles:
                        main_profile_folder = cluster_df.loc[
                            cluster_df['ext'] == p]['dirname'].to_list()[0]
                        p = p.lower()
                        if p == 'vfdb':
                            p = p + '_full'

                        profile_folder = os.path.join(main_profile_folder, p)
                        (seq_id, seq_sequence
                         ) = retrieve_genes.retrieve_genes_ids_sequences(
                             profile_folder, g, Debug)
                        if (seq_id):
                            ## save results
                            results_geneIDs.loc[len(results_geneIDs)] = (
                                name, g, seq_id, seq_sequence)

        else:
            ## file with gene ids is not accessible: stop here instead of
            ## failing later with an undefined results_geneIDs
            print(colored("ERROR: file provided via --genes_ids_fasta is not accessible", 'red'))
            exit()

        ## save for each gene in a separate fasta file
        list_of_genes = set(results_geneIDs['gene'].to_list())

        ## debug
        if Debug:
            print("** DEBUG **")
            print(results_geneIDs)
            print(list_of_genes)

        ## Save results
        genes_folder = HCGB_files.create_subfolder('genes', summary_report)
        for gene_retrieved in list_of_genes:
            this_frame = results_geneIDs[results_geneIDs['gene'] ==
                                         gene_retrieved]

            gene_retrieved_file = os.path.join(genes_folder, gene_retrieved)
            gene_retrieved_fasta = gene_retrieved_file + ".fasta"
            gene_retrieved_info = gene_retrieved_file + "_info.txt"
            fasta_hd = open(gene_retrieved_fasta, 'w')
            info_hd = open(gene_retrieved_info, 'w')

            for item, row in this_frame.iterrows():
                string2write = ">" + row['sample'] + '_' + row[
                    'gene'] + '\n' + row['sequence'] + '\n'
                string2write_info = row['sample'] + '\t' + row[
                    'gene'] + '\t' + row['id'] + '\n'
                fasta_hd.write(string2write)
                info_hd.write(string2write_info)

            fasta_hd.close()
            info_hd.close()

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ########################################
        ## create gene promoter fasta sequences retrieval if desired
        ########################################
        if options.promoter_bp:
            ## retrieve as many bp as necessary from genes_ids_fasta
            print("** THIS OPTION IS NOT IMPLEMENTED YET... **")
            #get_promoter.get_promoter(file, geneOfInterest, basePairs, sampleName, option, debug=False):

    ########################################
    ## create gene specific report if any
    ########################################
    if options.genes_ids_profile:
        if options.species_report == "Saureus":
            if Debug:
                print("** options.genes_ids_profile **")
                print("Analysis already done for Saureus")
        else:
            in_file = os.path.abspath(options.genes_ids_profile)
            gene_names = [line.rstrip('\n') for line in open(in_file)]
            results_Profiles = retrieve_genes.get_genes_profile(
                pd_samples_info, gene_names, options.debug, "name")
            if options.debug:
                print("results_Profiles")
                print(results_Profiles)

            ## open excel writer
            name_excel = summary_report + '/gene_ids_profile.xlsx'
            writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
            results_Profiles.to_excel(writer, sheet_name="gene_ids")

            ## close
            writer.save()

            ## time stamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###############################################
    ## Search for any additional fasta sequence
    ###############################################
    if options.genes_fasta:
        ## given a list of fasta sequences search using blast against proteins annotated or genome
        print("** THIS OPTION IS NOT IMPLEMENTED YET... **")

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Report generation module.")
    return ()
Example #12
def get_reference_gbk(options):

    ####################
    ## Genbank_ID
    ####################
    reference_gbk_file = ""
    if options.Genbank_ID:
        db_frame_ncbi = database_generator.getdbs('NCBI', options.database, 'genbank', options.debug)
    
        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_ncbi **", 'yellow'))
            print (db_frame_ncbi) 

        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.database)
        dir_path = os.path.join(NCBI_folder, 'genbank', 'bacteria', options.Genbank_ID)    
        if (options.Genbank_ID in db_frame_ncbi.index): 
            print('\t+ Reference (%s) available in database provided' %options.Genbank_ID)
        else:
            print ('\t+ Reference (%s) is not available in database provided' %options.Genbank_ID)
            print ('\t+ Try to download it.')
            database_generator.ngd_download(dir_path, options.Genbank_ID, NCBI_folder)
    
        ## get files download
        (genome, prot, gff, gbk) = database_generator.get_files_download(dir_path)
        if options.debug:
                print (colored("**DEBUG: genome:" + genome, 'yellow'))
                print (colored("**DEBUG: prot:" + prot, 'yellow'))
                print (colored("**DEBUG: gff:" + gff, 'yellow'))
                print (colored("**DEBUG: gbk:" + gbk, 'yellow'))
                
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n+ No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()
            
    ####################          
    ## user_sample_ID
    ####################
    elif options.user_sample_ID:
        db_frame_user_Data = database_user.get_userData_files(options, os.path.join(options.database, 'user_data'))
        df_data = db_frame_user_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.user_sample_ID)
            print('\t+ Reference (%s) available in database folder provided' %options.user_sample_ID)
        except:
            print (colored('** WARNING: Reference (%s) not available in database folder provided' %options.user_sample_ID, 'yellow'))
            print ("\t+ Let's try to update the database first.")
            db_frame_user_dataUpdated = database_user.update_database_user_data(options.database, input_dir, options.debug, options)
            df_data = db_frame_user_dataUpdated.groupby('name')
 
            try:
                this_sample_df = df_data.get_group(options.user_sample_ID)
                print('\t+ Reference (%s) available in database updated' %options.user_sample_ID)
                db_frame_user_Data = db_frame_user_dataUpdated

            except:
                print(colored('\n** ERROR: No reference (%s) available in database updated. Some error occurred...' %options.user_sample_ID, 'red'))
                exit()

        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_user_Data **", 'yellow'))
            print (db_frame_user_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)


        ## get gbk file
        gbk = this_sample_df.loc[ this_sample_df['ext']=='gbf','sample'].values[0]
        
        ## debug
        if options.debug:
            print ("** DEBUG: this_sample_df")
            print (this_sample_df)
            print ('gbk:' + gbk)
  
        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()
        
    ####################    
    ## project_sample_ID
    ####################
    elif options.project_sample_ID:
        
        db_frame_project_Data = database_user.get_userData_files(options, options.input)
        df_data = db_frame_project_Data.groupby('name')

        try:
            this_sample_df = df_data.get_group(options.project_sample_ID)
            print('\t+ Reference (%s) available in project folder provided' %options.project_sample_ID)
        except:
            print (colored('** ERROR: Reference (%s) not available in project folder provided' %options.project_sample_ID, 'red'))
            print ('\t+ Check the spelling or provide a valid ID.')
            exit()
 
        ## debug message
        if (options.debug):
            print (colored("**DEBUG: db_frame_project_Data **", 'yellow'))
            print (db_frame_project_Data)
            print (colored("**DEBUG: this_sample_df (groupby name)**", 'yellow'))
            print (this_sample_df)

        ## get gbk file
        gbk = this_sample_df.loc[ this_sample_df['ext']=='gbf','sample'].values[0]

        ## debug
        if options.debug:
            print ("** DEBUG: this_sample_df")
            print (this_sample_df)
            print ('gbk:' + gbk)

        ## check if exists
        if HCGB_files.is_non_zero_file(gbk):
            print('\t+ Genbank file format reference available.')
            reference_gbk_file = gbk
        else:
            print(colored('\n** ERROR: No genbank file available for the reference specified. Some error occurred while downloading', 'red'))
            exit()

    ####################
    ## user_ref
    ####################
    elif options.user_ref:
        options.user_ref = os.path.abspath(options.user_ref)
        if HCGB_files.is_non_zero_file(options.user_ref):
            print('\t+ Reference provided via --user_ref is available and ready to use.')
        else:
            print('\n** ERROR: Reference provided via --user_ref not available or accessible.')
            print(colored('\n+ Check the path or integrity of the file. Some error occurred...', 'red'))
            exit()
        reference_gbk_file = options.user_ref

    
    return (reference_gbk_file)
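get_reference_gbk dispatches on four mutually exclusive options. A sketch of a minimal options object for the Genbank_ID branch; attribute names are taken from the code above, values are hypothetical:

from types import SimpleNamespace

options = SimpleNamespace(
    Genbank_ID='GCA_000013425.1',     ## triggers the NCBI branch
    user_sample_ID=None, project_sample_ID=None, user_ref=None,
    database='/data/BacterialTyper_db', debug=False)
reference_gbk_file = get_reference_gbk(options)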
Example #13
def run_phylo(options):
    """
    Main function acting as an entry point to the module *phylo*.
    """

    ##################################
    ### show help messages if desired    
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False
        
    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Phylogenetic reconstruction")

    print ("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir=""

    ## set mode: project/detached
    ## Project mode as default
    project_mode=True
    if (options.detached):
        options.project = False
        project_mode=False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir    
    
    ## get the database 
    options.database = os.path.abspath(options.database)
    
    ### parse the reference
    print ("+ Retrieve the reference...")
    reference_gbk_file = get_reference_gbk(options)
                 
    ## generate output folder, if necessary
    print ("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    
    ##################################
    ## select samples and map    
    ####################################
    print ("+ Retrieve samples to map available...")
    dict_folders = map_samples(options, reference_gbk_file, input_dir, outdir)
    
    if Debug:
        print (colored("**DEBUG: dict_folders **", 'yellow'))
        print (dict_folders)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ##################################
    ## Create core alignment
    ##################################
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    phylo_dir = HCGB_files.create_subfolder("phylo", outdir_report)
    analysis_dir = HCGB_files.create_subfolder(options.name, phylo_dir)
    snippy_dir = HCGB_files.create_subfolder("snippy", analysis_dir)
        
    list_folders = list(dict_folders.values())
    options_string = ""
    variant_calling.snippy_core_call(list_folders, options_string, options.name, 
                                     snippy_dir, options.output_format, Debug)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## snp distance matrix
    snp_distance_dir = HCGB_files.create_subfolder("snp_distance", analysis_dir)
    name_matrix = os.path.join(snp_distance_dir, "snp_matrix_" + options.name)
    
    countGaps = False
    aln_file = os.path.join(snippy_dir, options.name + '.aln')
    phylo_parser.get_snp_distance(aln_file, options.output_format, countGaps, name_matrix, Debug)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## phylogenetic analysis
    iqtree_output = HCGB_files.create_subfolder("iqtree", analysis_dir)
    phylo_parser.ml_tree(snippy_dir, options.name, options.threads, iqtree_output, Debug)
    
    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print ("+ Exiting Annotation module.")
    return()
Example #14
def mapReads_module(options, pd_samples_retrieved, outdir_dict, Debug,
                    max_workers_int, threads_job, start_time_partial, outdir):

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["new_name"])

    ## options
    STAR_exe = set_config.get_exe("STAR", Debug=Debug)
    cwd_folder = os.path.abspath("./")
    folder = files_functions.create_subfolder('STAR_files', cwd_folder)

    ## With many samples, STAR would otherwise have to load the genome index into
    ## memory for every mapping job; for a single sample this does not matter, but
    ## note the genome may stay resident in shared memory.
    ## Strategy: before the loop, load the index once (--genomeLoad LoadAndExit);
    ## inside the loop, reuse it (--genomeLoad LoadAndKeep, shared memory > 30 Gb);
    ## when the loop finishes, remove it from memory (--genomeLoad Remove).

    ## check reference
    if (options.fasta):
        print("+ Genome fasta file provided")
        print("+ Create genomeDir for later usage...")
        options.fasta = os.path.abspath(options.fasta)

        ## create genomeDir
        options.genomeDir = mapReads.create_genomeDir(folder, STAR_exe,
                                                      options.threads,
                                                      options.fasta,
                                                      options.limitRAM)

    elif (options.genomeDir):
        print("+ genomeDir provided.")
        options.genomeDir = os.path.abspath(options.genomeDir)

    ## remove previous reference genome from memory
    print("+ Remove genome in memory from previous call... (if any)")
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder,
                           options.threads)

    ## load reference genome
    mapReads.load_Genome(folder, STAR_exe, options.genomeDir, options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    print("+ Mapping sequencing reads for each sample retrieved...")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(mapReads_caller,
                            sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, STAR_exe,
                            options.genomeDir, options.limitRAM, Debug): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Mapping reads has finished...")

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## remove reference genome from memory
    mapReads.remove_Genome(STAR_exe, options.genomeDir, folder,
                           options.threads)

    ## functions.time_functions.timestamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        map_report = files_functions.create_subfolder("STAR", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "STAR", map_report,
                                           "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % map_report)

    return (start_time_partial)
def discardPlasmids(contigs, plasmids, path, sample):
	
	## check if any plasmids
	if (plasmids == 'FAIL'):
		#print ('+ No plasmids assembled.')
		#print ('+ No need to discard any plasmids from the main assembly')
		
		contig_out_file = os.path.dirname(path) + '/' + sample + '/' + sample + '_chromosome.fna.tmp'
		shutil.copy(contigs, contig_out_file)
		return (contig_out_file, plasmids)
	
	## discard 
	print ('+ Check if any plasmids are also reported in main assembly...')

	folder = HCGB_files.create_subfolder('blast_search', path)	
	
	## makeblastDB
	dbName = folder + '/mainAssembly'
	HCGB_blast.makeblastdb(dbName, contigs)
	
	## blastn command
	outFile = folder + '/blastn_output.txt'
	threads = 1
	HCGB_blast.blastn(outFile, dbName, plasmids, threads)
	
	########################
	## parseBlast results
	########################
	
	## thresholds
	eval_thresh_float = float(1e-20)
	aln_thresh_given = 90
	min_length = 1000
	
	outFile_parsed = folder + '/blastn_output_parsed.txt'
	output_file = open(outFile_parsed, 'w')	
	sequences2discard = []

	print ('+ Parsing BLAST results generated...\n')
	## get results
	fh = open(outFile)
	for blast_record in HCGB_blast.parse(fh, eval_thresh=eval_thresh_float, aln_thresh=aln_thresh_given, length_thresh=min_length):
		for hit in blast_record.hits:
			for hsp in hit:
				output_file.write('****Alignment****')
				output_file.write('\n')
				
				output_file.write('query id: {}'.format(blast_record.qid))
				output_file.write('\n')
				
				sequences2discard.append(hsp.sid)
				output_file.write('sequence: %s' %hsp.sid)
				output_file.write('\n')
				
				output_file.write('e value: %s' %hsp.evalue)
				output_file.write('\n')
				
				output_file.write('aln: %s' %hsp.length)
				output_file.write('\n')
				
				output_file.write('qlen: %s [>%s]' %(hsp.qlen, min_length))
				output_file.write('\n')
				
				aln = (int(hsp.qlen)/int(hsp.slen))*100
				output_file.write('aln/slen: %s [> %s]' %(aln, aln_thresh_given))
				output_file.write('\n\n')

	fh.close()
	output_file.close()
	
	items = len(sequences2discard)
	print ('There are %s sequences to discard from the main assembly identified as plasmids' %items)

	## print filtered contigs
	contig_out_file = os.path.dirname(path) + '/' + sample + '/' + sample + '_chromosome.fna.tmp'
	plasmid_out_file = os.path.dirname(path) + '/' + sample + '/' + sample + '_plasmid.fna.tmp'
		
	contig_out_file_handle = open(contig_out_file, 'w')
	plasmid_out_file_handle = open(plasmid_out_file, 'w')
	
	contig_items = SeqIO.parse(contigs, 'fasta')
	for seq in contig_items:
		if seq.id in sequences2discard:
			plasmid_out_file_handle.write(seq.format("fasta"))
			plasmid_out_file_handle.write('\n')
		else:
			contig_out_file_handle.write(seq.format("fasta"))
			contig_out_file_handle.write('\n')

	contig_out_file_handle.close()
	plasmid_out_file_handle.close()	
	
	return (contig_out_file, plasmid_out_file)
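An illustrative call with hypothetical paths; passing plasmids as 'FAIL' short-circuits the BLAST search, and both returned files carry a .fna.tmp suffix:

(chromosome_fna, plasmid_fna) = discardPlasmids(
    '/data/assembly/sample1/contigs.fna',     ## main assembly
    '/data/assembly/sample1/plasmids.fna',    ## or 'FAIL' if none assembled
    '/data/assembly/sample1/blast', 'sample1')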
Example #16
def parse_options(arg_dict):

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy,
                 organism) = BacDup.scripts.functions.get_gbk_information(
                     file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(
                            arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder(
                "db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(taxid) for taxid in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get,
         allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
             db_folder,
             string_info_total,
             int(arg_dict.k_random),
             arg_dict.debug,
             assembly_level_given=arg_dict.assembly_level,
             group_given=group_obtained,
             section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'all_entries.txt'),
            allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nNote: if a random subset was selected, re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## create df_accID to store data
        ## TODO

    ## Returns dataframe with information

    df_accID = df_accID.set_index('new_name')
    return (df_accID)
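
## A minimal sketch of the taxid 'unravel' step above, assuming the
## taxonomy_retrieval wrapper relies on ete3's NCBITaxa (taxid 1279 is used
## only as an illustration): a higher-rank taxonomy ID is expanded into its
## descendant species before GenBank entries are selected.
from ete3 import NCBITaxa

ncbi_sketch = NCBITaxa()  ## downloads/uses a local NCBI taxdump on first run

def unravel_taxid(taxid):
    """Return [taxid] if it is species-level, else its descendant species."""
    rank = ncbi_sketch.get_rank([taxid]).get(taxid, "")
    if rank == "species":
        return [taxid]
    ## collapse_subspecies=True reports species instead of strains/subspecies
    return ncbi_sketch.get_descendant_taxa(taxid, collapse_subspecies=True)

## e.g. expand the genus Staphylococcus (taxid 1279) into species taxids
print([str(t) for t in unravel_taxid(1279)])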
Example #17
def run_annotation(options):

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = [v for v in outdir_dict.values()]
    protein_files = []
    print(
        "+ Detailed information for each sample can be found in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
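
## Sketch of the per-sample thread split used by run_annotation() above,
## assuming HCGB_main.optimize_threads simply balances the total threads
## against the number of samples (annot_caller_sketch stands in for the
## real annot_caller).
import concurrent.futures

def optimize_threads_sketch(total_threads, n_samples):
    ## at least one thread per job, never more than requested in total
    return max(1, int(total_threads / n_samples))

total_threads = 8
samples = ["sample1", "sample2", "sample3"]
threads_job = optimize_threads_sketch(total_threads, len(samples))  ## -> 2
max_workers_int = int(total_threads / threads_job)                  ## -> 4

def annot_caller_sketch(name, threads):
    return "%s annotated with %i threads" % (name, threads)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
    jobs = {executor.submit(annot_caller_sketch, s, threads_job): s for s in samples}
    for job in concurrent.futures.as_completed(jobs):
        print(job.result())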
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.
	
	:param source: Type of database to search: ARIBA, KMA, NCBI, MLST, user_data
	:param database_folder: Absolute path to database folder.
	:param option: String containing multiple entries separated by '#' that indicate the type of database entries to search within each source type.
	:param debug: True/False for debugging messages.
	
	:type source: string
	:type database_folder: string
	:type option: string
	:type debug: bool
	
	:returns: Dataframe containing absolute paths to the available databases for each type requested. It contains columns for: "source", "db", "path"
		
	e.g.: 	source = KMA
			option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank
			
	e.g.: 	source = NCBI
			option = genbank
	
	"""

    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## user input
    dbs2use = []
    option_list = option.split("#")

    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        ###
        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_External', name_ext, ext
                    ]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        elif (option_item.startswith('ARIBA:')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")
                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'Mash_external', name_ext_, ext
                    ]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(
                option_item
            )  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ## init dataframe
    #colname = ["source", "db", "path"]
    #db_Dataframe  = pd.DataFrame(columns = colname)

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names
        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))
            else:
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug,
                                                    2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA databases exists
                this_db_file = this_db + '/genbank_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_genbank', 'genbank', this_db_file
                    ]

            #### user_data
            elif (db == "user_data"):
                ## KMA databases exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- user_data: including information from user previously generated results",
                            'green'))  ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_user_data', 'user_data', this_db_file
                    ]

            ## default KMA databases: bacteria & plasmids
            else:
                ##
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)
                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_db', db, this_db_file
                    ]
                    print(
                        colored(
                            "\t- KMA: including information from database " +
                            db, 'green'))
                else:
                    print(
                        colored("\t**KMA: Database %s was not available." % db,
                                'red'))

                    ## if missing: call download module
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'KMA_db', db, this_db_file
                        ]
                        print(
                            colored(
                                "\t- KMA: including information from database "
                                + db, 'green'))
                    else:
                        print(
                            colored(
                                "\t**KMA: Database %s was not available." % db,
                                'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):

        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            ##
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(
                    os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'NCBI:genbank', entry, this_db
                    ]

        elif dbs2use[0] == 'taxonomy_id':
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder(
                    'PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)

                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'MLST', 'PubMLST', entry + ',' + this_db
                    ]
                    print(
                        colored(
                            "\t- MLST: including information from profile: " +
                            entry, 'green'))

            else:
                db_Dataframe.loc[len(db_Dataframe)] = [
                    'MLST', 'user_profile', db
                ]
                print(
                    colored(
                        "\t- MLST: including information from profile provided by user: "
                        + db, 'green'))

    #################
    #### MinHash ####
    #################
    elif (source == 'MinHash'):
        ## widen the dataframe with the extra fields stored for min-hash
        ## entries; these additional column names are an assumption taken
        ## from the rows built below
        colname = ["source", "db", "path", "original", "ksize", "num_sketch", "folder"]
        db_Dataframe = db_Dataframe.reindex(columns=colname)

        ### get information
        for db in dbs2use:

            #### genbank
            if (db == "genbank"):

                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(
                        colored(
                            "\n\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')

                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])

                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(
                            this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN', 'NaN', 'NaN']
                            else:
                                original = HCGB_main.readList_fromFile(
                                    file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0], original[1],
                                original[2], this_db
                            ]
                        else:
                            ## index assembly or reads...
                            list_fna = HCGB_main.retrieve_matching_files(
                                this_db, 'genomic.fna', debug)

                            ## not available
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db
                            ]

            #### user_data
            elif (db == "user_data"):
                print(
                    colored(
                        "\n\t- user_data: including information from user previously generated results",
                        'green'))  ## include user data
                db2use_abs = HCGB_files.create_subfolder(
                    'user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    if entry == 'user_database.csv':
                        continue

                    print('\t+ Reading information from sample: ', entry)
                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        ##
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0], original[1],
                            original[2], this_db + '/mash'
                        ]
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(
                            this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0], 'NaN',
                            'NaN', this_db + '/mash'
                        ]

    #### external_data
    ### TODO: Fix this
    mash_bin = ""  #set_config.get_exe('mash')
    if 'Mash_external' in db_Dataframe['source'].to_list():
        print(
            colored(
                "\t- external_data: including information from external data provided by user",
                'green'))  ## include user data
        db_Dataframe = db_Dataframe.set_index("db", drop=False)
        frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
        for index, row in frame.iterrows():
            print('\t+ Reading information for file: ', row['db'])
            outfile = row['path'] + '.msh'
            if not os.path.exists(outfile):
                path_file = os.path.dirname(row['path'])
                this_db_file = min_hash_caller.sketch_database([row['path']],
                                                               mash_bin,
                                                               row['path'],
                                                               row['db'],
                                                               path_file)
                HCGB_aes.print_sepLine("*", 50, False)

            ## pad to the seven columns used above (ksize/num_sketch unknown here)
            db_Dataframe.loc[row['db']] = [
                'Mash_external', row['db'], outfile, row['path'],
                'NaN', 'NaN', os.path.dirname(row['path'])
            ]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
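
## Usage sketch for the option string getdbs() expects: '#' separates source
## entries and ':' introduces comma-separated values (paths are hypothetical).
option_sketch = "kma:bacteria,plasmids#kma_external:/path/db1.fna,/path/db2.fna#user_data#genbank"

for option_item in option_sketch.split("#"):
    if ":" in option_item:
        key, values = option_item.split(":", 1)
        print(key, "->", values.split(","))
    else:
        print(option_item, "-> flag entry")

## prints:
##   kma -> ['bacteria', 'plasmids']
##   kma_external -> ['/path/db1.fna', '/path/db2.fna']
##   user_data -> flag entry
##   genbank -> flag entry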
Example #19
def send_kma_job(outdir_file, list_files, name, database, threads, Debug):
    """
	Executes KMA identification jobs
	
	This function automates checking whether a previous run succeeded and, if not,
	runs the appropriate identification process for the sample and database provided.
	
	:param outdir_file:
	:param list_files:
	:param name:
	:param database:
	:param threads:
	:param dataFrame_sample:
	
	:type outdir_file:
	:type list_files:
	:type name:
	:type database:
	:type threads:
	:type dataFrame_sample:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.kma_ident_call`
	
		- :func:`BacterialTyper.module.ident.get_outfile`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
		
		
	"""

    if (Debug):
        print(colored("**DEBUG: ident.send_kma_job call**", 'yellow'))
        print("outdir_file")
        print(outdir_file)
        print("list_files")
        print(list_files)
        print("name: " + name)
        print("database: " + database)

    ## outdir_KMA
    outdir_dict_kma = HCGB_files.create_subfolder("kma", outdir_file)

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## get outfile
    outfile = get_outfile(outdir_dict_kma, name, database)

    ## check if previously run and succeeded
    basename_tag = os.path.basename(outfile)
    filename_stamp = outdir_dict_kma + '/.success_' + basename_tag

    if (Debug):
        print("Outdir: ", outdir_dict_kma)
        print("outfile: ", outfile)
        print("Filename_stamp: ", filename_stamp)

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(
            colored(
                "\tA previous command generated results on: %s [%s]" %
                (stamp, name), 'yellow'))
    else:
        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: species_identification_KMA.kma_ident_module call**",
                    'yellow'))
            print("outfile = get_outfile(outdir_dict_kma, name, db2use)")
            print("outfile: ", outfile)
            print(
                "species_identification_KMA.kma_ident_module(outfile, list_files, name, database, threads) "
            )
            print("species_identification_KMA.kma_ident_module" + "\t" +
                  outfile + "\t" + str(list_files) + "\t" + name + "\t" +
                  database + "\t" + str(threads) + "\n")

        ## Sparse or not
        #if any(name in basename_tag for name in ['userData_KMA', 'genbank_KMA']):
        #if (basename_tag == 'userData_KMA'):
        #    option = ''
        #else:
        #    option = '-Sparse '

        ## Add option to retrieve database from memory
        option = ""
        option = option + '-shm 1'

        # Call KMA
        species_identification_KMA.kma_ident_call(outfile, list_files, name,
                                                  database, kma_bin, option,
                                                  threads)
        stamp = HCGB_time.print_time_stamp(filename_stamp)
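
## Sketch of the kma call that species_identification_KMA.kma_ident_call()
## wraps for one sample (file and database paths hypothetical). The '-shm 1'
## option set above tells kma to read the template database from shared
## memory, which load_db()/remove_db() manage around the whole batch.
import subprocess

kma_bin_sketch = "kma"                            ## via set_config.get_exe("kma")
db_sketch = "/data/KMA_db/bacteria/bacteria.ATG"  ## hypothetical indexed database
cmd_sketch = [kma_bin_sketch,
              "-ipe", "sample_R1.fq.gz", "sample_R2.fq.gz",  ## paired-end reads
              "-o", "outdir/kma/sample_bacteria.ATG",        ## output prefix
              "-t_db", db_sketch, "-t", "2", "-shm", "1"]
subprocess.run(cmd_sketch, check=False)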
Example #20
def run_ident(options):
    """
	Main function acting as an entry point to the module *ident*.
	
	Arguments:
	
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	"""

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_KMA):
        ## information for KMA Software
        species_identification_KMA.help_kma_database()
        exit()

    elif (options.help_MLSTar):
        ## information for KMA Software
        MLSTar.help_MLSTar()
        exit()

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ### species_identification_KMA -> most similar taxa
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Species identification")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    global Project

    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
        Project = False
    else:
        options.project = True
        outdir = input_dir
        Project = True

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "ident",
                                            options.debug)

    ## let's start the process
    print(
        "+ Generate a species typification for each sample retrieved using:")
    print("(1) Kmer alignment (KMA) software.")
    print("(2) Pre-defined databases by KMA or user-defined databases.")

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ######## KMA identification
    dataFrame_kma = KMA_ident(options, pd_samples_retrieved, outdir_dict,
                              retrieve_databases, start_time_partial)

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve results to summarize **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print("dataframe_kma")
        print(dataFrame_kma)

    ## exit if viral search
    skip = False
    if (len(options.kma_dbs) == 1):
        for i in options.kma_dbs:
            if (i == 'viral'):
                print()
                MLST_results = ''
                options.fast = True
                skip = True

            ## what if only plasmids?

    ## do edirect and MLST if bacteria
    if (not skip):
        dataFrame_edirect = pd.DataFrame()

        ######## EDirect identification
        #dataFrame_edirect = edirect_ident(dataFrame_kma, outdir_dict, Debug)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(colored("**DEBUG: retrieve results from NCBI **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("dataFrame_edirect")
            print(dataFrame_edirect)

        ######## MLST identification
        MLST_results = MLST_ident(options, dataFrame_kma, outdir_dict,
                                  dataFrame_edirect, retrieve_databases)

        ## functions.timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

        ## debug message
        if (Debug):
            print(
                colored("**DEBUG: retrieve results to summarize **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print("MLST_results")
            print(MLST_results)

    ## generate summary for sample: all databases
    ## MLST, plasmids, genome, etc
    HCGB_aes.boxymcboxface("Results Summary")

    #####################################
    ## Summary identification results  ##
    #####################################

    ## parse results
    if options.project:
        final_dir = os.path.join(outdir, 'report', 'ident')
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = outdir

    ###
    excel_folder = HCGB_files.create_subfolder("samples", final_dir)
    print('+ Print summary results in folder: ', final_dir)
    print('+ Print sample results in folder: ', excel_folder)

    # Group dataframe results summary by sample name
    sample_results_summary = dataFrame_kma.groupby(["Sample"])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: sample_results_summary **", 'yellow'))
        print(sample_results_summary)

    ##
    results_summary_KMA = pd.DataFrame()
    MLST_all = pd.DataFrame()
    for name, grouped in sample_results_summary:

        ## create a excel and txt for sample
        name_sample_excel = excel_folder + '/' + name + '_ident.xlsx'
        name_sample_csv = outdir_dict[
            name] + '/ident_summary.csv'  ## check in detached mode

        writer_sample = pd.ExcelWriter(
            name_sample_excel, engine='xlsxwriter')  ## open excel handle

        ## subset dataframe	& print result
        results_summary_toPrint_sample = grouped[[
            'Sample', '#Template', 'Query_Coverage', 'Template_Coverage',
            'Depth', 'Database'
        ]]
        results_summary_toPrint_sample.to_excel(
            writer_sample, sheet_name="KMA")  ## write excel handle
        results_summary_toPrint_sample.to_csv(
            name_sample_csv)  ## write csv for sample

        ## read MLST
        if MLST_results:
            if name in MLST_results:
                sample_MLST = pd.read_csv(MLST_results[name],
                                          header=0,
                                          sep=',')
                sample_MLST['genus'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'genus'].values[0]
                sample_MLST['species'] = dataFrame_edirect.loc[
                    dataFrame_edirect['sample'] == name, 'species'].values[0]
                sample_MLST.to_excel(writer_sample,
                                     sheet_name="MLST")  ## write excel handle

                ## Return information to excel
                MLST_all = pd.concat([MLST_all, sample_MLST])

        ## close excel handle
        writer_sample.save()

    ##
    name_excel = final_dir + '/identification_summary.xlsx'
    print('+ Summary information in excel file: ', name_excel)
    writer = pd.ExcelWriter(name_excel,
                            engine='xlsxwriter')  ## open excel handle

    ## KMA dataframe: print result for sources
    results_summary_KMA = dataFrame_kma[[
        'Sample', '#Template', 'Query_Coverage', 'Template_Coverage', 'Depth',
        'Database'
    ]]

    ## Sum plasmid and chromosome statistics ##
    ## sum coverage
    total_coverage = results_summary_KMA.groupby(
        'Sample')['Query_Coverage'].sum().reset_index()

    ## debug message
    if (Debug):
        print("*** Sum: Query_coverage ***")
        print(total_coverage)

    ## TODO: FIX SUMMARY REPORT
    results_summary_KMA = results_summary_KMA.set_index('Sample')
    results_summary_KMA = results_summary_KMA.sort_values(
        by=['Sample', 'Database', 'Query_Coverage'],
        ascending=[True, True, True])
    results_summary_KMA.to_excel(writer,
                                 sheet_name='KMA')  ## write excel handle

    ## write MLST
    if (MLST_results):
        MLST_all.to_excel(writer, sheet_name='MLST')

    ## write excel and close
    writer.save()  ## close excel handle

    print("\n+ Check summary of results in file generated")

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################
    ## update database for later usage
    ######################################
    if not options.fast:

        HCGB_aes.boxymcboxface("Update Sample Database")

        ## update db
        print("+ Update database with samples identified")

        ## debug message
        if (Debug):
            print(colored("**DEBUG: dataFrame_edirect **", 'yellow'))
            pd.set_option('display.max_colwidth', None)
            pd.set_option('display.max_columns', None)
            print(dataFrame_edirect)

        ## dataFrame_edirect
        file_toprint = final_dir + '/edirect_info2download.csv'
        dataFrame_edirect.to_csv(file_toprint)

        ## update database with samples identified
        data2download = dataFrame_edirect.filter(
            ['genus', 'species', 'strain', 'genome'])
        data2download = data2download.rename(columns={
            'genome': 'NCBI_assembly_ID',
            'strain': 'name'
        })
        NCBI_folder = os.path.abspath(options.database) + '/NCBI'
        database_generator.NCBI_DB(data2download, NCBI_folder, Debug)

    else:
        print(
            "+ No update of the database has been requested using option --fast"
        )

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting identification module.")
    return ()
Example #21
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
              time_partial):
    """Kmer identification using software KMA_.
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param pd_samples_retrieved: pandas dataframe for samples to process.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param retrieve_databases: 
	:param time_partial: timestamp of start time of the process.
	
	:type options: 
	:type pd_samples_retrieved: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type retrieve_databases: pandas.DataFrame()
	:type time_partial: 
	
	:return: Information of the identification. See example below.
	:rtype: pandas.DataFrame()
	
	See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:
	
	.. include:: ../../devel/results/KMA_ident_example.csv
		:literal:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.config.set_config.get_exe`
	
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.modules.ident.send_kma_job`
		
		- :func:`BacterialTyper.modules.ident.get_outfile`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`
	
		- :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`
	
		
	.. include:: ../../links.inc	
	
	"""

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])

            index_status = species_identification_KMA.check_db_indexed(
                db2use['path'], fold_name)
            if (index_status == True):
                print(
                    colored(
                        "\t+ Databases %s seems to be fine...\n\n" %
                        db2use['db'], 'green'))
                databases2use.append(db2use['path'])
            else:
                #databases2use.remove(db2use)
                print(
                    colored(
                        "\t**Databases %s is not correctly indexed. Not using it...\n"
                        % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
                'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(
                kma_bin, db2use)
            ## send for each sample
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(
                kma_bin, db2use)

            if (return_code_rm == 'FAIL'):
                print(
                    colored(
                        "***ERROR: Removing database %s from memory failed. Please remove it manually!"
                        % db2use, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")
    results_summary = pd.DataFrame()
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided

        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder(
                "kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Default: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            if (results.index.size > 1):
                if (basename_db == "plasmids.T" or basename_db == "viral.TG"):
                    ## let it be several entries
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)
                else:
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(
                        colored("Sample %s contains multiple strains." % name,
                                'yellow'))
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(colored(results, 'yellow'))
                    print('\n\n')

                    ## add both strains if detected
                    results['Sample'] = name
                    results_summary = pd.concat([results_summary, results],
                                                ignore_index=True)

                    ## TODO: add multi-isolate flag

            elif (results.index.size == 1):  ## 1 clear reference
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

            else:
                print(
                    colored(
                        '\tNo clear strain from database %s has been assigned to sample %s'
                        % (basename_db, name), 'yellow'))
                ## add empty line if no available
                results['Sample'] = name
                results_summary = pd.concat([results_summary, results],
                                            ignore_index=True)

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        print(results_summary.to_csv(quotechar='"'))

    return (results_summary)
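
## Sketch of the cutoff filter parse_kma_results() applies above, assuming the
## KMA .spa/.res output is tab-separated with '#Template' and
## 'Template_Coverage' columns (the module default cutoff is 80).
import pandas as pd

def parse_kma_results_sketch(spa_file, cutoff=80):
    results = pd.read_csv(spa_file, sep="\t")
    ## keep only templates covered above the cutoff
    return results[results["Template_Coverage"] >= cutoff]

## hits = parse_kma_results_sketch("sample_out.spa", cutoff=80)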
Example #22
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.
    
    This module prepares data for later gene duplication analysis. 
    
    It allows the user to provide either a single sample, multiple samples, NCBI 
    GenBank IDs or NCBI taxonomy IDs to retrieve and obtain the annotation data.    
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir
    else:
        arg_dict.project = True
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')

    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)

    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
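
## Sketch of the dataframe parse_options() is expected to return (all values
## hypothetical), matching the column list noted above; run_input() and the
## downstream modules index it by 'new_name'.
import pandas as pd

df_accID_sketch = pd.DataFrame([{
    'new_name': 'sample1', 'folder': '/data/sample1', 'genus': 'Escherichia',
    'species': 'coli', 'taxonomy': '562', 'genome': '/data/sample1/genome.fna',
    'annot_file': '/data/sample1/annot.gbff', 'format_annot_file': 'genbank',
    'proteins': '/data/sample1/proteins.faa',
    'plasmids_number': 1, 'plasmids_ID': 'plasmid_1'
}]).set_index('new_name')
print(df_accID_sketch)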
Example #23
def get_assembly_stats_all(assembly_stats_dict, outdir, debug):
    ## get all assembly stats
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    final_dir = HCGB_files.create_subfolder("assembly_stats", outdir_report)
    final_sub_dir = HCGB_files.create_subfolder("samples", final_dir)

    #### summary and information
    results_summary_toPrint_all = pd.DataFrame()
    column_names = ("Type", "Sample", "Total Sequences", "GC% Content",
                    "Longest sequence", "Shortest sequence", "Median length",
                    "Mean length", "Total Length (bp)", "L10", "N10", "L20",
                    "N20", "L30", "N30", "L40", "N40", "L50", "N50")

    ## debugging messages
    if debug:
        HCGB_aes.debug_message("Create assembly statistic for all samples")

    for sample_name in assembly_stats_dict:
        excel_file_stats = assembly_stats_dict[sample_name][1]

        if debug:
            HCGB_aes.debug_message("sample_name: " + sample_name, 'yellow')
            HCGB_aes.debug_message("excel: " + excel_file_stats, 'yellow')
            HCGB_aes.debug_message("contig stats dictionary: ", 'yellow')
            print(assembly_stats_dict[sample_name][0]['Contig Stats'])
            HCGB_aes.debug_message("scaffold stats dictionary: ", 'yellow')
            print(assembly_stats_dict[sample_name][0]['Scaffold Stats'])

        # get contig
        contig_stats = pd.DataFrame.from_dict(
            assembly_stats_dict[sample_name][0]['Contig Stats'],
            orient='index').transpose()
        contig_stats['type'] = 'contigs'
        contig_stats['sample_name'] = sample_name

        # get scaffold
        scaff_stats = pd.DataFrame.from_dict(
            assembly_stats_dict[sample_name][0]['Scaffold Stats'],
            orient='index').transpose()
        scaff_stats['type'] = 'scaffolds'
        scaff_stats['sample_name'] = sample_name

        ## copy individual excel file
        shutil.copy(excel_file_stats, final_sub_dir)

        ## add all data
        results_summary_toPrint_all = pd.concat(
            [results_summary_toPrint_all, contig_stats, scaff_stats],
            ignore_index=True)

    ## reorder columns: two successive rotations move the trailing
    ## 'sample_name' and 'type' columns to the front
    cols = results_summary_toPrint_all.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    cols = cols[-1:] + cols[:-1]
    results_summary_toPrint_all = results_summary_toPrint_all[cols]

    ## write to excel
    name_excel_summary = final_dir + '/summary_stats.xlsx'
    writer_summary = pd.ExcelWriter(name_excel_summary,
                                    engine='xlsxwriter')  ## open excel handle

    ## filter important columns
    results_summary_toPrint_all = results_summary_toPrint_all.set_axis(
        column_names, axis=1)

    ## save in excel
    results_summary_toPrint_all.to_excel(
        writer_summary, sheet_name="all_data")  ## write excel handle
    writer_summary.save()  ## close excel handle
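
## Sketch of the input get_assembly_stats_all() expects (values hypothetical):
## a dict mapping each sample name to a (stats, excel_path) tuple, where the
## stats dictionary holds 'Contig Stats' and 'Scaffold Stats' sub-dicts (the
## real sub-dicts carry the full field set behind column_names above).
assembly_stats_dict_sketch = {
    "sample1": (
        {"Contig Stats":   {"sequence_count": 120, "total_bps": 5100000, "N50": 250000},
         "Scaffold Stats": {"sequence_count": 95,  "total_bps": 5100000, "N50": 310000}},
        "/analysis/sample1/assembly_stats.xlsx",
    ),
}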
Example #24
def run_biotype(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for join reads
        RNAbiotype.help_info()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)

    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## mapping results are needed below; mapReads_module is assumed to
    ## return them along with the updated time stamp
    (start_time_partial, mapping_results) = mapReads_module(
        options, pd_samples_retrieved, mapping_outdir_dict, options.debug,
        max_workers_int, threads_job, start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detailed information for each sample can be found in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results
        dict_files = {}

        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file
            ## copy pdf plot, if any was generated
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting join module.")
    return ()
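
## Hypothetical invocation sketch for run_biotype(): the attribute names
## mirror those the function reads, but this Namespace merely stands in for
## the real argparse options object, so treat it as an assumption.
from argparse import Namespace

options = Namespace(
    input="my_project", output_folder="", detached=False, debug=False,
    single_end=False, noTrim=False, threads=8, annotation="annotation.gtf",
    skip_report=False, help_format=False, help_project=False,
    help_RNAbiotype=False)
## run_biotype(options)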
Exemple #25
0
def edirect_ident(dataFrame, outdir_dict, Debug):
    """Connect to NCBI for information retrieval
	
	This function uses the software edirect_ to connect to NCBI and retrieve information regarding samples, assemblies, publications, etc.
	
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	
	:return: Information of the identification 
	:rtype: pandas.DataFrame()
	
	See example of returned dataframe in file :file:`/devel/results/edirect_download_results.csv` here:
	
	.. include:: ../../devel/results/edirect_download_results.csv
		:literal:
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.get_info_file`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.print_time_stamp`

		- :func:`BacterialTyper.scripts.functions.optimize_threads`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
	
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.functions.is_non_zero_file`
	
		- :func:`BacterialTyper.scripts.edirect_caller.generate_docsum_call`
		
		- :func:`BacterialTyper.scripts.edirect_caller.generate_xtract_call`
		
	.. include:: ../../links.inc	
	"""
    ################################################
    ## TODO: What to do if multi-isolate sample?
    ################################################

    ## edirect
    HCGB_aes.boxymcboxface("EDirect information")
    print("+ Connect to NCBI to get information from samples identified...")

    ## create dataframe to return results
    edirect_frame = pd.DataFrame(columns=("sample", "genus", "species",
                                          "strain", "BioSample", "genome",
                                          "Plasmids"))

    ## debugging messages
    if Debug:
        print("*******************************************************")
        print("Dataframe sample_results: ")

    # Group dataframe sample name
    sample_results = dataFrame.groupby(["Sample"])

    for name, grouped in sample_results:
        ## debugging messages
        if Debug:
            print("Name: ", name)
            print(grouped)

        ## use edirect to get Species_name and entry for later identification
        edirect_folder = HCGB_files.create_subfolder('edirect',
                                                     outdir_dict[name])

        ## chromosome match
        if (len(grouped.loc[grouped['Database'] == 'bacteria.ATG']
                ['#Template']) == 0):
            if Debug:
                print("Name: ", name)
                print("No chromosome match identified by kmer")

            genus = ''
            species = ''
            BioSample_name = ''
            AssemblyAcc = ''
            strain = ''
            GenbankAcc = ['']  ## keep downstream indexing safe
            filename_stamp = edirect_folder + '/.success_species'

        else:
            nucc_entry = grouped.loc[grouped['Database'] == 'bacteria.ATG'][
                '#Template'].values[0].split()
            ## e.g. NZ_CP029680.1 Staphylococcus aureus strain AR_0215 chromosome, complete genome

            ##
            out_docsum_file = edirect_folder + '/nuccore_docsum.txt'
            tmp_species_outfile = edirect_folder + '/info.csv'
            filename_stamp = edirect_folder + '/.success_species'

            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous command generated results on: %s [%s]" %
                        (stamp, name), 'yellow'))
                status = True
            else:
                edirect_caller.generate_docsum_call('nuccore', nucc_entry[0],
                                                    out_docsum_file)
                status = edirect_caller.generate_xtract_call(
                    out_docsum_file, 'DocumentSummary',
                    'Organism,BioSample,AssemblyAcc,Strain',
                    tmp_species_outfile)

            ########################################
            ## get information from edirect call
            ########################################
            if not status:
                print("NO INFORMATION")
                continue

            taxa_name_tmp = HCGB_main.get_info_file(tmp_species_outfile)
            Organism = taxa_name_tmp[0].split(',')[0].split()
            genus = Organism[0]  ## genus
            species = Organism[1]  ## species
            BioSample_name = taxa_name_tmp[0].split(',')[1]  ## BioSample
            AssemblyAcc = taxa_name_tmp[0].split(',')[2]  ## AssemblyAcc

            ## sometimes strain is missing
            if len(taxa_name_tmp[0].split(',')) > 3:
                strain = taxa_name_tmp[0].split(',')[3]  ## strain
            else:
                strain = 'NaN'

            ## get GenBank accession ID
            out_docsum_file_assembly = edirect_folder + '/assembly_docsum.txt'
            AssemblyAcc_outfile = edirect_folder + '/AssemblyAcc.csv'

            edirect_caller.generate_docsum_call('assembly', AssemblyAcc,
                                                out_docsum_file_assembly)
            edirect_caller.generate_xtract_call(out_docsum_file_assembly,
                                                'DocumentSummary', 'Genbank',
                                                AssemblyAcc_outfile)

            ## some error occurred
            if not HCGB_main.is_non_zero_file(out_docsum_file_assembly):
                continue

            ## Is it better to download Refseq or Genbank?
            ## https://www.quora.com/What-is-the-difference-between-Refseq-and-Genbank

            GenbankAcc = HCGB_main.get_info_file(AssemblyAcc_outfile)
            if Debug:
                print("Sample: ", name)
                print("Genbank Acc: ", GenbankAcc[0])

        ## plasmid match
        group_plasmid = grouped.loc[grouped['Database'] == 'plasmids.T']
        plasmid_entries = group_plasmid['#Template'].tolist()
        ## e.g. NZ_CP029083.1 Staphylococcus aureus strain AR464 plasmid unnamed1, complete sequence
        plasmid_entries_str = ",".join([i.split()[0] for i in plasmid_entries])

        ## save results in edirect_frame
        ## columns: ("sample", "genus", "species", "strain", "BioSample", "genome", "Plasmids")
        edirect_frame.loc[len(edirect_frame)] = (name, genus, species, strain,
                                                 BioSample_name, GenbankAcc[0],
                                                 plasmid_entries_str)

        stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## debugging messages
    if Debug:
        print("*******************************************************")

    return (edirect_frame)
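
## Minimal sketch of the field parsing performed above on the xtract output
## line 'Organism,BioSample,AssemblyAcc[,Strain]'; the example line is made up.
line = "Staphylococcus aureus,SAMN00000000,GCF_000000000.1,AR_0215"
fields = line.split(',')
Organism = fields[0].split()
genus, species = Organism[0], Organism[1]
BioSample_name = fields[1]
AssemblyAcc = fields[2]
strain = fields[3] if len(fields) > 3 else 'NaN'  ## strain is sometimes missing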
Exemple #26
0
def run(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list(
            "available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(
                        r".*trim_R\d{1}.*",
                        files)  ## only paired-end. TODO: handle single-end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report",
                                   "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

        ## create fastqc for trimmed reads
        pd_samples_retrieved_trimmed = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
                  start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    return ()
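
## A generic sketch of the submit/as_completed pattern used above to trim
## each sample group in parallel; worker() and the sample names are dummies.
import concurrent.futures

def worker(name):
    return name.upper()

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    commandsSent = {executor.submit(worker, n): n for n in ("sample1", "sample2")}
    for cmd2 in concurrent.futures.as_completed(commandsSent):
        try:
            data = cmd2.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (commandsSent[cmd2], exc))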
Exemple #27
0
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial):
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status ##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(
                        options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(
                colored("**DEBUG: card_trick_info: " + card_trick_info + " **",
                        'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db',
                                           'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name],
                                                       db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0],
                    db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job,
                    options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print(
                "+ Collecting information for each sample analyzed for database: "
                + db2use[1])
            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print(
        "\n+ Generate a summary file for all samples and one for each database employed..."
    )

    ## parse results
    if Project:  ## module-level flag set by the calling pipeline
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir) ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Databases other than VFDB and CARD would collapse into this single file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:

            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                ## rename in place: the 'Status' column takes the sample name
                df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()
        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print(
        "+ For each database upload files *phandango.csv and *phandango.tre and visualize results"
    )
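
## Toy sketch of the per-database merge above: each sample's 'Genes'/'Status'
## table becomes one column, the columns are concatenated side by side, and
## the result is transposed so rows are samples. Values are illustrative.
import pandas as pd

per_sample = {
    'sample1': pd.DataFrame({'Genes': ['mecA', 'blaZ'], 'Status': ['yes', 'no']}),
    'sample2': pd.DataFrame({'Genes': ['mecA'], 'Status': ['yes']}),
}
merge_df = pd.DataFrame()
for sample, df in per_sample.items():
    df = df.set_index('Genes')
    df.rename(columns={'Status': sample}, inplace=True)
    merge_df = pd.concat([merge_df, df[[sample]]], axis=1, sort=True)
merge_df.fillna("NaN", inplace=True)
trans_df = merge_df.transpose()  ## rows: samples; columns: genes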
Exemple #28
0
def MLST_ident(options, dataFrame, outdir_dict, dataFrame_edirect,
               retrieve_databases):
    """Generate MLST profile identification
	
	This function uses the `MLSTar software`_ to retrieve multilocus sequence typing (MLST) profiles from PubMLST_ for the given species previously identified by KMA. It generates an MLST profile for each sample. 
	
	:param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
	:param dataFrame: pandas dataframe for samples to process. Result from :func:`BacterialTyper.modules.ident.KMA_ident`.
	:param outdir_dict: dictionary containing information for each sample of the output folder for this process.
	:param dataFrame_edirect: pandas dataframe resulted from :func:`BacterialTyper.modules.ident.edirect_ident`.
	:param retrieve_databases: 
	
	:type options: 
	:type dataFrame: pandas.DataFrame()
	:type outdir_dict: Dictionary
	:type dataFrame_edirect: pandas.DataFrame()
	:type retrieve_databases: pandas.DataFrame()
	
	:return: Information of the MLST identification. Dictionary keys are samples and values are the absolute path to the file generated by :func:`BacterialTyper.scripts.MLSTar.run_doMLST` containing MLST information.
	:rtype: Dictionary

	
	See example of returned dataframe in file :file:`/devel/results/doMLST_result_example.csv` here:
	
	.. include:: ../../devel/results/doMLST_result_example.csv
		:literal:
	
	.. seealso:: Additional information to PubMLST available datasets.
	
		- :doc:`PubMLST datasets<../../../data/PubMLST_datasets>`
	
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`
	
		- :func:`BacterialTyper.scripts.functions.create_subfolder`
		
		- :func:`BacterialTyper.scripts.functions.boxymcboxface`
		
		- :func:`BacterialTyper.scripts.MLSTar.run_MLSTar`
		
		- :func:`HCGB.sampleParser.files.get_files`
		
		- :func:`BacterialTyper.scripts.MLSTar.get_MLSTar_species`
		
	.. include:: ../../links.inc	
	"""
    ## set config
    rscript = set_config.get_exe("Rscript")

    ## TODO: samples might not be assembled; take this into account and return 0

    ## TODO: Fix and install MLSTar during installation
    print(MLSTar.get_MLSTar_package_installed())
    exit()

    ########################################################################################

    ## TODO: What to do if multi-isolate sample?
    ## TODO: Control if a different profile is provided via --MLST_profile
    ## TODO: Check time passed and download again if >?? days passed

    ## debug message
    if (Debug):
        print(colored("**DEBUG: dataFrame_edirect identified**", 'yellow'))
        print(dataFrame_edirect)

    ## MLST call
    HCGB_aes.boxymcboxface("MLST typing")
    print(
        "+ Create classical MLST typing for each sample according to the species retrieved by kmer..."
    )

    ## get assembly files
    input_dir = os.path.abspath(options.input)
    assembly_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: assembly_samples_retrieved**", 'yellow'))
        print(assembly_samples_retrieved)

    # init
    MLST_results = {}

    ## get MLST_profile: default or provided
    mlst_profile_list = retrieve_databases.loc[retrieve_databases['db'] ==
                                               'PubMLST']['path'].tolist()

    if (Debug):
        print("** Debug **")
        print("mlst_profile_list")
        print(mlst_profile_list)

        print("dataFrame_edirect")
        print(dataFrame_edirect)

    ## Generate MLST call according to species identified for each sample
    for index, row in dataFrame_edirect.iterrows():
        MLSTar_taxa_name = MLSTar.get_MLSTar_species(row['genus'],
                                                     row['species'])

        if (MLSTar_taxa_name == 'NaN'):
            print(
                colored(
                    "\t- No PubMLST profile available for sample [%s] identified as %s %s"
                    % (row['sample'], row['genus'], row['species']), 'yellow'))

        else:
            for mlst_profile in mlst_profile_list:

                ## species folder
                #species_mlst_folder = functions.create_subfolder(MLSTar_taxa_name, pubmlst_folder)
                species_mlst = mlst_profile.split(',')[0]
                species_mlst_folder = mlst_profile.split(',')[1]

                ## output file
                output_file = species_mlst_folder + '/PubMLST_available_scheme.csv'
                filename_stamp = species_mlst_folder + '/.success_scheme'

                ##
                if MLSTar_taxa_name == species_mlst:
                    if os.path.isfile(filename_stamp):
                        stamp = HCGB_time.read_time_stamp(filename_stamp)
                        print(
                            colored(
                                "\tA previous command generated results on: %s"
                                % stamp, 'yellow'))
                    else:
                        ### get scheme available
                        MLSTar.getPUBMLST(MLSTar_taxa_name, rscript,
                                          output_file)
                        stamp = HCGB_time.print_time_stamp(filename_stamp)

                    ## parse and get scheme for classical MLST
                    schemes_MLST = pd.read_csv(output_file, sep=',', header=0)

                    ## keep the scheme id of entries with fewer than 10 loci
                    ## (note: the loop retains the last matching scheme)
                    for item, cluster in schemes_MLST.iterrows():
                        if cluster['len'] < 10:
                            scheme2use = int(cluster['scheme'])
                    ###
                    sample = row['sample']
                    MLSTar_folder = HCGB_files.create_subfolder(
                        'MLST', outdir_dict[sample])
                    genome_file = assembly_samples_retrieved.loc[
                        assembly_samples_retrieved['name'] ==
                        sample]['sample'].values[0]

                    ## call MLST
                    (results, profile_folder) = MLSTar.run_MLSTar(
                        species_mlst_folder, rscript, MLSTar_taxa_name,
                        scheme2use, sample, MLSTar_folder, genome_file,
                        options.threads)
                    MLST_results[sample] = results

    ##
    print("+ Finish this step...")
    return (MLST_results)
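
## Sketch of the scheme selection above with a toy table: keep the scheme id
## of entries whose 'len' is below 10 (the loop keeps the *last* match; the
## original leaves scheme2use undefined when nothing matches).
import pandas as pd

schemes_MLST = pd.DataFrame({'scheme': [1, 2, 3], 'len': [7, 15, 8]})
scheme2use = None
for item, cluster in schemes_MLST.iterrows():
    if cluster['len'] < 10:
        scheme2use = int(cluster['scheme'])  ## ends up 3 with this table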
Exemple #29
0
def run_cluster(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "fastq",
                ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        else:
            ## trimm reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "trim", ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]

    else:
        ## default
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    # exit if empty
    if pd_samples_retrieved.empty:
        print(
            "No data has been retrieved from the project folder provided. Exiting now..."
        )
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "mash",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[~retrieve_databases.index.
                                                    isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            if all([
                    int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])
            ]):
                siglist_all.append(
                    min_hash_caller.read_signature(row['path'],
                                                   options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        ## use single .loc calls so the assignments actually write back
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)
    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(
                colored(
                    '\t+ Sketched signature (%s) available within user data...'
                    % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## File signature might exist

            ## read original
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)
                if all([
                        int(options.kmer_size) == int(original[1]),
                        int(options.n_sketch) == int(original[2])
                ]):
                    siglist_all.append(
                        min_hash_caller.read_signature(this_sig,
                                                       options.kmer_size))
                    pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                        'project_data', index, this_sig, row['sample'],
                        options.kmer_size, options.n_sketch)
                    print(
                        colored(
                            '\t+ Sketched signature available (%s) in project folder...'
                            % index, 'green'))
                    continue

        print(
            colored('\t+ Sketched signature to be generated: (%s)...' % index,
                    'yellow'))
        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = ('project_data',
                                                             index, sigfile,
                                                             row['sample'],
                                                             options.kmer_size,
                                                             options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[[
            'source', 'db', 'path', 'original', 'ksize', 'num_sketch'
        ]]
        tmp = tmp.rename(columns={'db': 'name'})
        tmp = tmp.set_index('name')  ## set_index returns a new frame

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp],
                               join='inner',
                               sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)

        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)

        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)

        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign Colors colorLabels
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data),
                 "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data),
                 "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all,
                                                      tag_cluster_info, Debug)

    ## plot images, colour-coded by source via colorLabels
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext,
                                            tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext,
                                    tag_cluster_info)

    return ()
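
## Sketch of the colour assignment used above: map each signature's source to
## a matplotlib colour code (r = genbank, g = project_data, b = user_data).
## Toy frame only.
import pandas as pd

cluster_df = pd.DataFrame({'source': ['genbank', 'project_data', 'user_data']},
                          index=['ref1', 's1', 'u1'])
color_df = cluster_df.filter(["source"], axis=1)
color_df["color"] = "r"  ## red::genbank (default)
color_df.loc[color_df["source"] == "project_data", "color"] = "g"
color_df.loc[color_df["source"] == "user_data", "color"] = "b"
colorLabels = color_df['color'].to_dict()  ## {'ref1': 'r', 's1': 'g', 'u1': 'b'}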
def main():
	## control if options provided or help
	if len(sys.argv) > 1:
		print ("")
	else:
		help_options()
		exit()    	

	file1 = os.path.abspath(sys.argv[1])
	file2 = os.path.abspath(sys.argv[2])
	sample = sys.argv[3]
	SPADES_bin = sys.argv[4]
	threads = int(sys.argv[5])
	path = sys.argv[6]

	folder = HCGB_files.create_subfolder(sample, path)

	## assembly main 
	path_to_contigs = run_SPADES_assembly(folder, file1, file2, sample, SPADES_bin, threads, debug=True)
	
	## assembly plasmids
	path_to_plasmids = run_SPADES_plasmid_assembly(folder, file1, file2, sample, SPADES_bin, threads)

	## discard plasmids from main
	tmp_contigs, tmp_plasmids = discardPlasmids(path_to_contigs, path_to_plasmids, folder, sample)
	
	## rename fasta sequences
	new_contigs_list = tmp_contigs.split(".tmp")
	new_contigs = new_contigs_list[0]
	rename_contigs(tmp_contigs, "scaffolds_chr", new_contigs)
	
	new_plasmids=""
	if os.path.isfile(tmp_plasmids):
		new_plasmids_list = tmp_plasmids.split(".tmp")
		new_plasmids = new_plasmids_list[0]
		rename_contigs(tmp_plasmids, "scaffolds_plasmids", new_plasmids)
	
	
	## generate contig statistics
	print ('+ Get assembly statistics:...\n')

	## get contig statistics	
	contig_out = contig_stats(new_contigs, True)	
	contig_out_file = open(contig_out, 'r')
	contig_out_file_read = contig_out_file.read()
	contig_out_file.close()
	
	## dump in screen
	print (contig_out_file_read)
	print ()	

	if not new_plasmids or new_plasmids == 'FAIL':
		print ('+ No plasmids identified...\n')
	else:
		print ('+ Plasmids assembly')
		plasmid_out = contig_stats(new_plasmids, True)	

		## dump in screen
		plasmid_out_file = open(plasmid_out, 'r')
		plasmid_file_read = plasmid_out_file.read()
		plasmid_out_file.close()
		print(plasmid_file_read)
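
## Hypothetical command line for the main() entry point above; the script name
## is made up, and the positional arguments follow the sys.argv indices read in
## main():
##   python spades_assembler.py reads_R1.fastq reads_R2.fastq sample1 \
##          /path/to/spades.py 4 /output/path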