## Example 1 (scraped snippet separator)
def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using
    BUSCO_ software and database. Samples are assembled concurrently
    using a thread pool sized from ``options.threads``.

    :param options: module options as parsed from the command line
        (input, output_folder, threads, debug, project/detached mode,
        single_end, no_BUSCO and the several ``help_*`` flags).

    :return: Empty tuple.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`

        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`

        - :func:`BacterialTyper.modules.qc.BUSCO_check`

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc

    """

    ## init time
    start_time_total = time.time()

    ## debugging messages: Debug is kept as a module-level flag so that
    ## helper functions in this module can read it too
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default: in detached mode results go to the given
    ## output folder, otherwise they are written inside the input folder
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files: trimmed reads ('_trim' tag) are the assembly input
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "assemble",
                                            options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads: split the total CPUs between concurrent samples
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each sample; map each future back to its sample name
        ## so failures can be reported per sample below
        commandsSent = {
            executor.submit(check_sample_assembly, name, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), threads_job):
            name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ## NOTE(review): assembly_stats is not defined in this function;
    ## presumably a module-level dict filled by check_sample_assembly —
    ## confirm it is initialized at module level.
    if (assembly_stats):
        ###################
        if Debug:
            HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
            print(assembly_stats)

        ## create single file
        get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()  ## no-op: BUSCO quality check skipped on user request
    else:
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial,
                                 "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
## Example 2 (scraped snippet separator)
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial):
    """Run ARIBA identification of resistance/virulence genes for each sample.

    Checks which ARIBA databases are correctly indexed, sends one ARIBA job
    per sample and database through a thread pool, collects per-database
    results and writes a combined Excel summary (``profile_summary.xlsx``).

    :param options: module options as parsed from the command line
        (threads, database, ARIBA_cutoff, output_folder, debug, ...).
    :param pd_samples_retrieved: pandas dataframe of samples to process.
    :param outdir_dict: dictionary with the output folder for each sample.
    :param retrieve_databases: pandas dataframe with one row per database
        (columns include 'source', 'path', 'db').
    :param start_time_partial: timestamp used for progress reporting.
    """
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status	##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(
                        options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(
                colored("**DEBUG: card_trick_info: " + card_trick_info + " **",
                        'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders: one row per (sample, database) combination
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db',
                                           'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name],
                                                       db2use[1], tmp)

    ## multi-index on (sample, db) for the per-job output lookup below
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff: default to 0.90 when not provided
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads: split the total CPUs between concurrent samples
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop: one batch of jobs per database, one job per sample
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample; futures are mapped to sample names
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0],
                    db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job,
                    options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print(
                "+ Collecting information for each sample analyzed for database: "
                + db2use[1])
            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print(
        "\n + Generate a summary file for all samples and one for each database employed..."
    )

    ## parse results
    ## NOTE(review): `Project` and `input_dir` are not defined in this
    ## function — presumably module-level globals set by the calling
    ## module; confirm, otherwise this branch raises NameError.
    if Project:
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ## vfdb flags whether the VFDB extra-information block below applies
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir) ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Different databases provided (different to VFDB and CARD) would collapse file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:

            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                ## NOTE(review): rename(..., inplace=True) returns None, so
                ## this first df2 assignment is dead; df2 is immediately
                ## reassigned on the next line, which makes it work anyway.
                df2 = df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()
        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    ## NOTE(review): ExcelWriter.save() was removed in pandas 2.0 in
    ## favour of close() — confirm the pandas version pinned here.
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print(
        "+ For each database upload files *phandango.csv and *phandango.tre and visualize results"
    )
## Example 3 (scraped snippet separator)
def KMA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
              time_partial):
    """Kmer identification using software KMA_.

    :param options: options passed to the :func:`BacterialTyper.modules.ident.run_ident` main function (threads, KMA_cutoff, etc). See details in...
    :param pd_samples_retrieved: pandas dataframe for samples to process.
    :param outdir_dict: dictionary containing information for each sample of the output folder for this process.
    :param retrieve_databases: dataframe with one row per database (columns include 'source', 'path', 'db').
    :param time_partial: timestamp of start time of the process.

    :type options: argparse.Namespace
    :type pd_samples_retrieved: pandas.DataFrame()
    :type outdir_dict: Dictionary
    :type retrieve_databases: pandas.DataFrame()
    :type time_partial: float

    :return: Information of the identification. See example below.
    :rtype: pandas.DataFrame()

    See example of returned dataframe in file :file:`/devel/results/KMA_ident_example.csv` here:

    .. include:: ../../devel/results/KMA_ident_example.csv
        :literal:

    .. seealso:: This function depends on other ``BacterialTyper`` functions called:

        - :func:`BacterialTyper.config.set_config.get_exe`

        - :func:`BacterialTyper.scripts.functions.boxymcboxface`

        - :func:`BacterialTyper.modules.ident.send_kma_job`

        - :func:`BacterialTyper.modules.ident.get_outfile`

        - :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

        - :func:`BacterialTyper.scripts.species_identification_KMA.parse_kma_results`


    .. include:: ../../links.inc

    """

    ## NOTE(review): this early return disables the whole function body —
    ## everything below is dead code. Presumably a temporary deactivation
    ## of KMA identification; confirm intent before removing.
    return (pd.DataFrame())

    ### print header
    HCGB_aes.boxymcboxface("KMA Identification")

    ## set defaults
    kma_bin = set_config.get_exe("kma")

    ## check status: keep only databases whose KMA index is valid
    databases2use = []
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (str(db2use['source']).startswith('KMA')):
            print('+ Check database: ' + db2use['db'])
            fold_name = os.path.dirname(db2use['path'])

            index_status = species_identification_KMA.check_db_indexed(
                db2use['path'], fold_name)
            if (index_status == True):
                print(
                    colored(
                        "\t+ Databases %s seems to be fine...\n\n" %
                        db2use['db'], 'green'))
                databases2use.append(db2use['path'])
            else:
                #databases2use.remove(db2use)
                print(
                    colored(
                        "\t**Databases %s is not correctly indexed. Not using it...\n"
                        % db2use['db'], 'red'))

    ## debug message
    if (Debug):
        print(
            colored(
                "**DEBUG: databases2use\n" + "\n".join(databases2use) + "\n**",
                'yellow'))

    ## Start identification of samples
    print("\n+ Send KMA identification jobs...")

    ## optimize threads: split the total CPUs between concurrent samples
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:

            ## load database on memory
            print("+ Loading database on memory for faster identification.")
            return_code_load = species_identification_KMA.load_db(
                kma_bin, db2use)
            ## send for each sample; futures mapped to sample names
            commandsSent = {
                executor.submit(send_kma_job, outdir_dict[name],
                                sorted(cluster["sample"].tolist()), name,
                                db2use, threads_job, Debug): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            ## remove database from memory
            print("+ Removing database from memory...")
            return_code_rm = species_identification_KMA.remove_db(
                kma_bin, db2use)

            if (return_code_rm == 'FAIL'):
                ## NOTE(review): cmd_rm_db is not defined anywhere in this
                ## function — this error path would itself raise NameError
                ## if the code is ever re-enabled.
                print(
                    colored(
                        "***ERROR: Removing database from memory failed. Please do it manually! Execute command: %s"
                        % cmd_rm_db, 'red'))

            ## functions.timestamp
            time_partial = HCGB_time.timestamp(time_partial)

    ## parse results
    print("+ KMA identification call finished for all samples...")
    print("+ Parse results now")
    results_summary = pd.DataFrame()
    for db2use in databases2use:
        ### [TODO]: parse data according to database: bacteria, plasmids or user data or genbank data provided

        basename_db = os.path.basename(db2use)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)

        ###
        for name, cluster in sample_frame:

            ## get result
            ## outdir_KMA
            outdir_dict_kma = HCGB_files.create_subfolder(
                "kma", outdir_dict[name])
            result = get_outfile(outdir_dict_kma, name, db2use)
            #print ('\t- File: ' + result + '.spa')

            ## get results using a cutoff value [Defaulta: 80]
            results = species_identification_KMA.parse_kma_results(
                result + '.spa', options.KMA_cutoff)
            results['Database'] = basename_db

            ### check if db2use is plasmids as it could be several.
            ## NOTE(review): DataFrame.append was removed in pandas 2.0;
            ## the appends below need pd.concat if this code is re-enabled.
            if (results.index.size > 1):
                if (basename_db == "plasmids.T" or basename_db == "viral.TG"):
                    ## let it be several entries
                    results['Sample'] = name
                    results_summary = results_summary.append(results,
                                                             ignore_index=True)
                else:
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(
                        colored("Sample %s contains multiple strains." % name,
                                'yellow'))
                    print(
                        colored("###########################################",
                                'yellow'))
                    print(colored(results, 'yellow'))
                    print('\n\n')

                    ## add both strains if detected
                    results['Sample'] = name
                    results_summary = results_summary.append(results,
                                                             ignore_index=True)

                    ## TODO: add multi-isolate flag

            elif (results.index.size == 1):  ## 1 clear reference
                results['Sample'] = name
                results_summary = results_summary.append(results,
                                                         ignore_index=True)

            else:
                print(
                    colored(
                        '\tNo clear strain from database %s has been assigned to sample %s'
                        % (basename_db, name), 'yellow'))
                ## add empty line if no available
                results['Sample'] = name
                results_summary = results_summary.append(results,
                                                         ignore_index=True)

    print("+ Finish this step...")

    ## debug message
    if (Debug):
        results_summary.to_csv(quotechar='"')

    return (results_summary)
## Example 4 (scraped snippet separator)
def map_samples(options, reference_gbk_file, input_dir, outdir):
    """Map reads of the retrieved samples against the given reference.

    Retrieves the samples to process (project samples and/or user-database
    samples), creates the corresponding output folders and sends one snippy
    variant-calling job per sample against *reference_gbk_file*.

    :param options: module options as parsed from the command line
        (all_data, only_project_data, user_data, genbank_data, database,
        project_sample_ID, user_sample_ID, name, other_options, threads,
        project, debug, ...).
    :param reference_gbk_file: reference annotation file used by snippy.
    :param input_dir: absolute path to the project input folder.
    :param outdir: absolute path to the output folder.

    :return: dictionary mapping each sample name to the mapping subfolder
        created for it (``<sample>_vs_<options.name>``).
    :rtype: dict
    """

    ## set it as variable: mapping of assembled contigs not implemented yet
    contig_option = False

    pd_samples_retrieved_merge = pd.DataFrame()
    pd_samples_retrieved = pd.DataFrame()

    ## all_data // only_project_data
    if (options.all_data or options.only_project_data):
        ## get trimmed reads to map
        pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug)

        ## discard the sample used as reference if any
        if options.project_sample_ID:
            pd_samples_retrieved = pd_samples_retrieved.drop(index=options.project_sample_ID)

        ## create output directories
        outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved, "phylo", options.debug)

    ####################################
    ## user_data // genbank_data // only_external_data
    if (options.all_data or options.user_data):

        ## --------------------------- ##
        ### get user database
        ## --------------------------- ##
        print("+ Retrieve samples to map from user database...")
        db_frame_user_Data = database_user.get_userData_files(options, os.path.join(options.database, 'user_data'))

        ## discard the sample used as reference if any
        if options.user_sample_ID:
            db_frame_user_Data = db_frame_user_Data.drop(index=options.user_sample_ID)

        ## create output directories in database entries in user_data
        outdir_dict2 = HCGB_files.outdir_subproject(os.path.join(options.database, 'user_data'), db_frame_user_Data, "phylo")

        ## If user desires to map contigs, map trimmed as default
        if (contig_option):
            # filter for assemblies retrieved
            db_frame_user_Data = db_frame_user_Data.loc[db_frame_user_Data['tag'] == "assembly",]
        else:
            # filter for raw reads
            db_frame_user_Data = db_frame_user_Data.loc[db_frame_user_Data['tag'] == "reads",]

        ## merge if both contain data
        if not pd_samples_retrieved.empty:
            pd_samples_retrieved_merge = pd.concat([db_frame_user_Data, pd_samples_retrieved], sort=True, ignore_index=True).drop_duplicates()
            outdir_dict.update(outdir_dict2)
        else:
            outdir_dict = outdir_dict2
            pd_samples_retrieved_merge = db_frame_user_Data

        ## --------------------------- ##
        ### get genbank database
        ## --------------------------- ##
        ## set contig option for these data only

    ## check data: db_frame_user_Data is only bound when the user_data
    ## branch above ran; otherwise fall back to the project samples.
    try:
        if db_frame_user_Data.empty:
            print()
    except NameError:  ## FIX: was a bare except: — narrowed to the expected failure
        pd_samples_retrieved_merge = pd_samples_retrieved

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved_merge **", 'yellow'))
        print(pd_samples_retrieved_merge)
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        ## NOTE(review): outdir_dict is unbound if neither branch above
        ## ran — confirm callers always set all_data / only_project_data /
        ## user_data.
        print(outdir_dict)

    ## TODO: use contig option
    if (contig_option or options.genbank_data):
        print()

    ####################################
    ## for fastq samples
    ####################################

    # optimize threads: split the total CPUs between concurrent samples
    name_list = set(pd_samples_retrieved_merge["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow'))
        print(colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow'))
        print(colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## call snippy
    print("\n+ Create mapping of fastq reads for project samples:")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved_merge.groupby(["name"])

    ## send one snippy job per sample; futures mapped to sample names
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(snippy_variant_caller, reference_gbk_file,
                            sorted(cluster["sample"].tolist()), threads_job,
                            outdir_dict[name], options.name, contig_option,
                            options.other_options, name, options.debug): name
            for name, cluster in sample_frame
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## subfolder within phylo for this mapping: <sample>_vs_<reference name>
    new_outdir_dict = {}
    for key, value in outdir_dict.items():
        tag = os.path.join(value, key + '_vs_' + options.name)
        new_outdir_dict[key] = tag

    return (new_outdir_dict)
## Example 5 (scraped snippet separator)
def run_annotation(options):
    """Main function of the annotation module.

    Annotates each assembled genome using Prokka, builds a MultiQC report
    for all samples (unless ``options.skip_report``) and checks each
    annotation using BUSCO.

    :param options: module options as parsed from the command line
        (input, output_folder, threads, kingdom, genera, project/detached
        mode, skip_report, debug and the several ``help_*`` flags).

    :return: Empty tuple.
    """

    ## init time
    start_time_total = time.time()

    ## debugging messages: module-level flag read by helper functions too
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()  ## FIX: exit() was missing; every other help branch exits here

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ### message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default: in detached mode results go to the given
    ## output folder, otherwise they are written inside the input folder
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files: assembled genomes (fna) are the annotation input
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads: split the total CPUs between concurrent samples
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send one annotation job per sample; futures mapped to row indices
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get folders and collect the protein files (.faa) produced
    givenList = list(outdir_dict.values())
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated: a .success stamp file
        ## marks an already-finished report so it is not regenerated
        filename_stamp = PROKKA_report + '/.success'
        done = 0
        if os.path.isdir(PROKKA_report):
            if os.path.isfile(filename_stamp):
                stamp = HCGB_time.read_time_stamp(filename_stamp)
                print(
                    colored(
                        "\tA previous report generated results on: %s" % stamp,
                        'yellow'))
                done = 1

        ## generate report
        if done == 0:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            filename_stamp = PROKKA_report + '/.success'
            stamp = HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    results = qc.BUSCO_check(input_dir, outdir, options,
                             start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
## Example 6 (scraped snippet separator)
def run(options):
    """Entry point of the trimming module.

    Trims adapters from each sample's fastq reads with Trimmomatic
    (via ``trimmo_caller``), processing samples in parallel threads,
    and optionally generates a MultiQC report plus FASTQC quality
    checks on the trimmed reads.

    :param options: argparse-like namespace with the module options
        (input, output_folder, threads, adapters, detached/project,
        single_end, skip_report, debug and the help_* flags).

    :returns: Empty tuple on completion.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages: mirror the option into the module-level flag
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default: detached mode writes results to a separate
    ## output folder, project mode stores them within the input folder tree.
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files: retrieve fastq samples from the input folder
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for samples: one "trimm" output subfolder per sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads: split available threads between parallel jobs
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name so each job gets all files of a sample
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file shipped with the pipeline
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list(
            "available_Trimmomatic_adapters")

    ## send for each sample: one trimmo_caller job per sample, in parallel
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                ## report per-sample failures but keep processing the rest
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    ## (only in detached mode: collect trimmed reads under one link folder)
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(
                        r".*trim_R\d{1}.*",
                        files)  ## only paired-end. Todo: single end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module over every per-sample trimm folder
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report",
                                   "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

        ## create fastqc for trimmed reads
        pd_samples_retrieved_trimmed = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
                  start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    return ()
## Esempio n. 7 (score: 0)
def run_biotype(options):
    """Entry point of the RNA biotype analysis module.

    Maps reads for each sample, classifies mapped reads by RNA biotype
    (featureCounts via the RNAbiotype module) and, unless skipped,
    summarizes all samples in a MultiQC report plus a merged CSV table
    and an R-generated summary plot.

    :param options: argparse-like namespace with the module options
        (input, output_folder, threads, annotation, noTrim,
        detached/project, single_end, skip_report, debug, help_* flags).

    :returns: Empty tuple on completion.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## exit after printing help, consistent with the other help branches
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for join reads
        RNAbiotype.help_info()
        exit()

    ## debugging messages: mirror the option into the module-level flag
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## set mode: project/detached
    if (options.detached):
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')

    ## get files: raw fastq if noTrim, otherwise the trimmed reads
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)

    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## for samples: one "map" output subfolder per sample
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads: split available threads between parallel jobs
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## NOTE(review): mapping_results is consumed below (RNAbiotype call),
    ## so mapReads_module is assumed to return (time_stamp, mapping_results)
    ## -- TODO confirm against the mapReads_module implementation.
    (start_time_partial, mapping_results) = mapReads_module(
        options, pd_samples_retrieved, mapping_outdir_dict, options.debug,
        max_workers_int, threads_job, start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## for samples: one "biotype" output subfolder per sample
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in biotype_outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results: per-sample featureCount tables to merge
        dict_files = {}

        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file
            ## copy pdf plot, if one was generated for this sample
            ## (guard against an empty match list to avoid IndexError)
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## set abs_csv_outfile to be in report folder
        ## copy or link files for each sample analyzed
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        callCode = system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    start_time_partial = time_functions.timestamp(start_time_total)
    print("\n+ Exiting join module.")
    return ()
## Esempio n. 8 (score: 0)
def fastqc(pd_samples_retrieved, outdir, options, start_time_total,
           name_analysis, Debug):
    """Run FASTQC quality checks for every sample retrieved.

    Creates one output folder per sample (named ``fastqc_<name_analysis>``),
    runs FASTQC for each sample in parallel threads and, unless skipped,
    summarizes the results with a MultiQC report.

    :param pd_samples_retrieved: pandas dataframe of samples (must contain
        "name" and "sample" columns, as used by the groupby below).
    :param outdir: output folder (created here when not in project mode).
    :param options: argparse-like namespace (project, threads, debug,
        skip_report).
    :param start_time_total: initial timestamp used for partial timing.
    :param name_analysis: label for this analysis (e.g. "raw", "trimmed").
    :param Debug: True to print debugging messages.

    :returns: Empty tuple on completion.
    """

    HCGB_aes.boxymcboxface("FASTQC Quality check for samples")

    ## debug message
    if (Debug):
        print(colored("\n**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)
        print("\n")

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")

    ## if not project, outdir contains the dir to put output
    ## in this case, in some other cases might not occur
    if not options.project:
        ## use HCGB_files consistently with the rest of this function
        ## (was `functions.create_folder`, a name not in scope here)
        HCGB_files.create_folder(outdir)
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved,
                                            "fastqc_" + name_analysis,
                                            options.debug)

    print("+ Checking quality for each sample retrieved...")
    start_time_partial = start_time_total

    # Group dataframe by sample name so each job gets all files of a sample
    sample_frame = pd_samples_retrieved.groupby(["name"])

    ## optimize threads: split available threads between parallel jobs
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads),
                               "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int),
                               "yellow")
        HCGB_aes.debug_message("threads_job: " + str(threads_job), "yellow")

    ## send for each sample
    print("+ Calling fastqc for samples...")
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=int(max_workers_int)) as executor:
        commandsSent = {
            executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name],
                            sorted(cluster["sample"].tolist()), name,
                            threads_job): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                ## report per-sample failures but keep processing the rest
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("+ FASTQC for samples has finished...")

    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        fastqc_report = HCGB_files.create_subfolder("FASTQC", outdir_report)
        fastqc_final_report = HCGB_files.create_subfolder(
            name_analysis, fastqc_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC",
                                           fastqc_final_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % fastqc_final_report)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting qc module.")
    ## return instead of exit(): this helper is called mid-pipeline by
    ## other modules (e.g. the trimm module), and exit() would terminate
    ## the whole process before the caller finishes. Matches the
    ## `return ()` convention of the sibling module functions.
    return ()
## Esempio n. 9 (score: 0)
def update_database_user_data(database_folder, project_folder, Debug, options):
    """
    Updates the ``user_data`` folder within the database folder provided.

    Generates a single subfolder for each sample previously analyzed and
    stores main information and result files for later interpretation,
    comparison and/or summarization with new samples analyzed.

    :param database_folder: path to the database folder to update.
    :param project_folder: path to the analyzed project folder to read from.
    :param Debug: True to print debugging messages.
    :param options: argparse-like namespace (threads, single_end, ...);
        ``options.project`` and ``options.debug`` are set here.

    :type database_folder: string
    :type project_folder: string
    :type Debug: boolean
    :type options: namespace

    :returns: Updated database result from :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
    :rtype: Dataframe

    :warnings: Returns **FAIL** if check process failed.

    .. seealso:: This function depends on other BacterialTyper functions called:

        - :func:`HCGB.functions.files_functions.create_subfolder`

        - :func:`HCGB.functions.main_functions.functions.get_data`

        - :func:`HCGB.functions.main_functions.optimize_threads`

        - :func:`BacterialTyper.scripts.database_user.get_userData_files`

        - :func:`BacterialTyper.scripts.database_user.update_sample`

        - :func:`BacterialTyper.scripts.database_generator.getdbs`

        - :func:`BacterialTyper.scripts.database_generator.get_database`

        - :func:`BacterialTyper.scripts.database_generator.update_db_data_file`

    """

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options: this entry point may be called without the
    ## full module option set, so fill in the values the code below reads.
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data: outer join keeps entries present in either dataframe
    project_all_data = pd.concat([project_data_df, project_info_df],
                                 join='outer',
                                 sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages:
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)

        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)

        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder,
                                         'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe: group all retrieved entries per sample name
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' %
          (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each: one update_sample job per sample, in parallel
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data,
                            user_data_db, Debug): name
            for name, cluster in sample_frame
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                ## report per-sample failures but keep processing the rest
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    ## read back the info.txt written by each update_sample job, if present,
    ## and merge it into the accumulated database dataframe
    for name, cluster in sample_frame:
        ###### dump to file
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            user_data_db = pd.concat([user_data_db, dataGot],
                                     join='outer',
                                     sort=True).drop_duplicates()
            ## concatenating by outer we get all available entries

    if (options.debug):
        print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
        print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'

    dataUpdated = database_generator.update_db_data_file(
        user_data_db, database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)