Example no. 1
def help_ARIBA():

	dict_ariba = citation.ariba_citation()
	## to do: finish filling information for different databases
	print ("")
	HCGB_aes.print_sepLine("*", 50, False)
	print ("CARD:")
	print ("The Comprehensive Antibiotic Resistance Database (CARD) is a rigorously curated collection of characterized, peer-reviewed  resistance determinants and associated antibiotics, organized by the Antibiotic Resistance Ontology (ARO) and AMR gene detection models.")
	print ('Citation:', dict_ariba['CARD'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("VFDB:")
	print ("The virulence factor database (VFDB) is an integrated and comprehensive online resource for curating information about virulence factors of bacterial pathogens. Since its inception in 2004, VFDB has been dedicated to providing up-to-date knowledge of VFs from various medically significant bacterial pathogens.")
	print ('Citation:', dict_ariba['VFDB'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("ARG-ANNOT:\n")
	print ("...")
	print ('Citation:', dict_ariba['ARG-ANNOT'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("MEGARes:")
	print ("The MEGARes database contains sequence data for approximately 4,000 hand-curated antimicrobial resistance genes accompanied by an annotation structure that is optimized for use with high throughput sequencing.")
	print ('Citation:', dict_ariba['MEGARes'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("PlasmidFinder:\n")
	print ("...")
	print ('Citation:', dict_ariba['PlasmidFinder'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("ResFinder:\n")
	print ("The ResFinder database is a curated database of acquired resistance genes.")
	print ('Citation:', dict_ariba['ResFinder'])
	print ("")	
	HCGB_aes.print_sepLine("*", 50, False)
	print ("srst2:")
	print ("...")
	print ('Citation:', dict_ariba['srst2'])
	print ("")
	HCGB_aes.print_sepLine("*", 50, False)
	print ("")
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.
	
	:param source: Type of database to search: ARIBA, KMA, NCBI, MLST, user_data
	:param database_folder: Absolute path to database folder.
	:param option: String containing multiple entries separated by '#' that indicate the type of database entries to search within each source type.
	:param debug: True/False for debugging messages.
	
	:type source: string
	:type database_folder: string
	:type option: string
	:type debug: bool
	
	:returns: Dataframe containing absolute paths to the available databases for each type requested. It contains columns for: "source", "db", "path"
		
	e.g.: 	source = KMA
			option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank **
			
	e.g.: 	source = NCBI
			option = genbank
	
	"""

    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## user input
    dbs2use = []
    option_list = option.split("#")

    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        ###
        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_External', name_ext, ext
                    ]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        elif (option_item.startswith('ARIBA:')):
            dbs2use = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")
                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'Mash_external', name_ext_, ext
                    ]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(
                option_item
            )  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ## init dataframe
    #colname = ["source", "db", "path"]
    #db_Dataframe  = pd.DataFrame(columns = colname)

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names
        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))
            else:
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug,
                                                    2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA databases exists
                this_db_file = this_db + '/genbank_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_genbank', 'genbank', this_db_file
                    ]

            #### user_data
            elif (db == "user_data"):
                ## KMA databases exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- user_data: including information from user previously generated results",
                            'green'))  ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_user_data', 'user_data', this_db_file
                    ]

            ## default KMA databases: bacteria & plasmids
            else:
                ##
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)
                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_db', db, this_db_file
                    ]
                    print(
                        colored(
                            "\t- KMA: including information from database " +
                            db, 'green'))
                else:
                    print(
                        colored("\t**KMA: Database %s was not available." % db,
                                'red'))

                    ## if missing: call download module
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'KMA_db', db, this_db_file
                        ]
                        print(
                            colored(
                                "\t- KMA: including information from database "
                                + db, 'green'))
                    else:
                        print(
                            colored(
                                "\t**KMA: Database %s was not available." % db,
                                'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):

        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            ##
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(
                    os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'NCBI:genbank', entry, this_db
                    ]

        elif dbs2use[0] == 'tax_id':
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder(
                    'PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)

                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'MLST', 'PubMLST', entry + ',' + this_db
                    ]
                    print(
                        colored(
                            "\t- MLST: including information from profile: " +
                            entry, 'green'))

            else:
                db_Dataframe.loc[len(db_Dataframe)] = [
                    'MLST', 'user_profile', db
                ]
                print(
                    colored(
                        "\t- MLST: including information from profile provided by user: "
                        + db, 'green'))

    ##############
    #### Mash ####
    ##############
    elif (source == 'Mash'):
        ## re-init dataframe with sketch-specific columns
        colname = [
            "source", "db", "path", "original", "ksize", "num_sketch", "folder"
        ]
        db_Dataframe = pd.DataFrame(columns=colname)

        ### get information
        for db in dbs2use:

            #### genbank
            if (db == "genbank"):

                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(
                        colored(
                            "\n\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')

                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])

                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(
                            this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN', 'NaN', 'NaN']
                            else:
                                original = HCGB_main.readList_fromFile(
                                    file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0], original[1],
                                original[2], this_db
                            ]
                        else:
                            ## index assembly or reads...
                            list_fna = HCGB_main.retrieve_matching_files(
                                this_db, 'genomic.fna', debug)

                            ## not available
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db
                            ]

            #### user_data
            elif (db == "user_data"):
                print(
                    colored(
                        "\n\t- user_data: including information from user previously generated results",
                        'green'))  ## include user data
                db2use_abs = HCGB_files.create_subfolder(
                    'user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    if entry == 'user_database.csv':
                        continue

                    print('\t+ Reading information from sample: ', entry)
                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        ##
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0], original[1],
                            original[2], this_db + '/mash'
                        ]
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(
                            this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0], 'NaN',
                            'NaN', this_db + '/mash'
                        ]

    #### external_data
    ### TODO: Fix this
    mash_bin = ""  #set_config.get_exe('mash')
    if any(name in 'Mash_external'
           for name in db_Dataframe['source'].to_list()):
        print(
            colored(
                "\t- external_data: including information from external data provided by user",
                'green'))  ## include user data
        db_Dataframe = db_Dataframe.set_index("db", drop=False)
        frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
        for index, row in frame.iterrows():
            print('\t+ Reading information for file: ', row['db'])
            outfile = row['path'] + '.msh'
            if not os.path.exists(outfile):
                path_file = os.path.dirname(row['path'])
                this_db_file = min_hash_caller.sketch_database([row['path']],
                                                               mash_bin,
                                                               row['path'],
                                                               row['db'],
                                                               path_file)
                HCGB_aes.print_sepLine("*", 50, False)

            db_Dataframe.loc[row['db']] = [
                'Mash_external', row['db'], outfile, row['path']
            ]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
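
## Usage sketch for getdbs() -- hypothetical database folder and option string,
## mirroring the '#'-separated format parsed above (commented out so it does not
## touch the filesystem here):
#
# option = "kma:bacteria,plasmids#kma_external:/data/extra.fna#user_data#genbank"
# db_frame = getdbs("KMA", "/data/BacterialTyper_DB", option, debug=False)
# print(db_frame[["source", "db", "path"]])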
Example no. 3
def run_search(arg_dict):
    """Main function of the search module in BacDup package.
    
    This module searches for and creates the gene duplication analysis. 

    It allows the user to provide either a previously parsed data project (NCBI GenBank IDs, taxonomy or user
    annotation data) or single or multiple samples.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    if (arg_dict.blast_help):
        info.blast_help()
        exit()

    if (arg_dict.project_help):
        info.project_help()
        exit()

    if (arg_dict.detached_mode_help):
        info.detached_mode()
        exit()

    ### Start the analysis
    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Search module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    outdir = os.path.abspath(arg_dict.input_folder)

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        ## output folder
        print("\n+ Create output folder(s):")
        HCGB.functions.files_functions.create_folder(outdir)
    else:
        arg_dict.project = True

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting information provided... ')
    print('+ Several options available:')
    print('\t* BacDup project folder with initiated data')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\t* Single/Multiple raw BLAST results files')
    print('\t* Single/Multiple fasta proteins + annotation table')

    print("""\n\n**** NOTE: **** 
    For additional options (e.g. Single/Multiple NCBI GenBank or taxonomy IDs)
    use the input module to accommodate accordingly """)
    time.sleep(1)

    print()

    ## parse options
    pd_samples_retrieved = parse_search_options(arg_dict)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## for each sample
    dict_search_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "search",
        arg_dict.debug)

    dict_dup_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)

    dict_parse_folders = HCGB.functions.files_functions.outdir_project(
        outdir, arg_dict.project, pd_samples_retrieved, "parse",
        arg_dict.debug)

    ## create results
    data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
    for sample, folder in dict_search_folders.items():

        annot_timestamp = os.path.join(dict_dup_folders[sample],
                                       '.annot_success')
        dup_annot_file = os.path.join(dict_dup_folders[sample],
                                      'dup_annot.csv')

        ## annotation
        annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table']

        if (not HCGB.functions.files_functions.is_non_zero_file(
                annot_timestamp)):

            ## get results
            file_data = pd_samples_retrieved.loc[sample, 'file_data']
            format = pd_samples_retrieved.loc[sample, 'format']
            filtered_data = dup_searcher.filter_data(
                sample, file_data, format, arg_dict.pident, arg_dict.evalue,
                arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)

            ## timestamps
            filter_timestamp = os.path.join(dict_dup_folders[sample],
                                            '.filter_success')
            if (not HCGB.functions.files_functions.is_non_zero_file(
                    filter_timestamp)):
                #save results as a .csv file
                sort_csv = os.path.abspath(
                    os.path.join(dict_dup_folders[sample],
                                 'filtered_results.csv'))
                filtered_data.to_csv(sort_csv, header=True, index=False)

                ## print time stamp
                HCGB_time.print_time_stamp(filter_timestamp)
            else:
                read_time = HCGB_time.read_time_stamp(filter_timestamp)
                print(
                    colored(
                        "\t+ Filter results already available for sample %s [%s]"
                        % (sample, read_time), 'green'))

            ## get annotation
            (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(
                sample, filtered_data, annot_table_file, arg_dict.debug)

            ##
            info_dup_file = os.path.join(dict_dup_folders[sample],
                                         'info_dup.csv')
            data2add_entry.to_csv(info_dup_file, header=True, index=False)

            ## save into file
            dup_annot_df.to_csv(dup_annot_file, header=True)

            ## print time stamp
            HCGB_time.print_time_stamp(annot_timestamp)

        else:
            read_time = HCGB_time.read_time_stamp(annot_timestamp)
            print(
                colored(
                    "\t+ Duplicate annotation already available for sample %s [%s]"
                    % (sample, read_time), 'green'))

            ## add info for each
            dup_annot_df = HCGB_main.get_data(dup_annot_file, ',',
                                              "index_col=0")
            annot_table = HCGB_main.get_data(annot_table_file, ',',
                                             "index_col=0")
            data2add_entry = dup_searcher.get_dup_stats(
                sample, dup_annot_df, annot_table, arg_dict.debug)

        ## add genome length data
        data2add_entry['genome_len'] = ''
        len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
        if os.path.isfile(len_df_file):
            len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
            data2add_entry['genome_len'] = len_data[1].sum()

        ## merge data (DataFrame.append was removed in pandas 2.0)
        #data2add_entry = data2add_entry.reset_index()
        data2add = pd.concat([data2add, data2add_entry], ignore_index=False)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing duplicated search")
    outdir_report = HCGB.functions.files_functions.create_subfolder(
        "report", outdir)
    dups_report = HCGB.functions.files_functions.create_subfolder(
        "dups", outdir_report)

    ## add data2add
    data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting search module.")
    return ()
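
## run_search() skips finished steps via hidden stamp files ('.filter_success',
## '.annot_success'). A minimal sketch of that caching pattern, assuming the same
## HCGB helpers used above:
def _run_step_once(stamp_file, step_fn):
    if not HCGB.functions.files_functions.is_non_zero_file(stamp_file):
        step_fn()                               ## expensive step, run only once
        HCGB_time.print_time_stamp(stamp_file)  ## record completion time
    else:
        ## stamp exists: reuse previous results and report when they were made
        return HCGB_time.read_time_stamp(stamp_file)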
Example no. 4
def download_ariba_databases(list_dbs, main_folder, Debug, threads):

	"""Download ARIBA_ databases.
	
	Using ARIBA software this function retrieves desired databases and prepare them for later analysis.
	
	:param list_dbs: List of databases to download.
	:param main_folder: Absolute path to database folder.
	:param Debug: True/false for printing developer messages
	:param threads: Number of CPUs to use.
	
	:type list_dbs: string 
	:type main_folder: string
	:type Debug: Boolean
	:type threads: integer
	
	 .. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.file_functions.create_subfolder`
		
		- :func:`HCGB.functions.time_functions.read_time_stamp`
		
		- :func:`BacterialTyper.scripts.ariba_caller.get_ARIBA_dbs`
	
		- :func:`BacterialTyper.scripts.ariba_caller.ariba_getref`		
		
	 
	.. include:: ../../links.inc
	"""

	print("\n\n+ Download databases for Antimicrobial Resistance Identification By Assembly (ARIBA).")
	ariba_folder = HCGB_files.create_subfolder("ARIBA", main_folder)

	## print ARIBA databases: 
	print ("+ Available databases:")
	dbs = get_ARIBA_dbs(list_dbs)
	
	for db_set in dbs:

		HCGB_aes.print_sepLine("-",30, False)
		print (colored("+ " + db_set,'yellow'))
		
		## prepare folders
		folder_set = HCGB_files.create_subfolder(db_set, ariba_folder)
		outdir_prepare_ref = folder_set + '_prepareref'

		## stamp time file
		filename_stamp_prepare = outdir_prepare_ref + '/.success'
	
		## check if previously done
		if os.path.isfile(filename_stamp_prepare):
			stamp = HCGB_time.read_time_stamp(filename_stamp_prepare)
			print ("\t+ Database was downloaded to folder: ", folder_set)
			print ("\t+ Data is available and indexed in folder: ", outdir_prepare_ref)
			print (colored("\tDatabase was previously downloaded and prepared on: %s" %stamp, 'yellow'))
		
			## Check if necessary to download again after several months/days
			days_passed = HCGB_time.get_diff_time(filename_stamp_prepare)
			print ("\t\t** %s days ago" %days_passed)		
			if (days_passed > 30): ## download again
				print ("\t\t** Downloading information again just to be sure...")
				return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
			else:
				return_ariba_getref = 'OK'
		else:
			return_ariba_getref = ariba_getref(db_set, folder_set, Debug, threads)
		
		if (return_ariba_getref == 'OK'):
			print()
		else:
			print (colored("** ARIBA getref failed or generated a warning for " + db_set, 'red'))
Example no. 5
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.
    
    This module prepares data for later gene duplication analysis. 
    
    It allows the user to provide either a single sample, multiple samples, NCBI 
    GenBank IDs or NCBI taxonomy IDs to retrieve the annotation data.    
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir
    else:
        arg_dict.project = True
        print(
            "+ Generate a directory containing information within the project folder provided"
        )
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t  |-- GenBank format files')
    print('\t  |-- GFF files +  Reference fasta files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')

    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)

    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'),
                    index=True,
                    header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
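
## parse_options() is expected to return one row per sample with the columns
## listed above. A hypothetical single-row illustration of that layout:
#
# df_accID = pd.DataFrame([{
#     'new_name': 'sample1', 'folder': '/data/sample1',
#     'genus': 'Escherichia', 'species': 'coli', 'taxonomy': '562',
#     'genome': '/data/sample1/genome.fna',
#     'annot_file': '/data/sample1/annot.gff', 'format_annot_file': 'gff',
#     'proteins': '/data/sample1/proteins.faa',
#     'plasmids_number': 0, 'plasmids_ID': ''}])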
Example no. 6
def get_options_db(options):
    """Select databases to use according to the input options.
	
	:param options:
	
	:returns: Dataframe with database information among all databases available.
	"""

    print("\n\n+ Select databases to use for identification:")

    ### database folder to use
    database2use = os.path.abspath(options.database)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: Database to use: " + database2use + " **",
                    'yellow'))

    ## according to user input: select databases to use
    option_db = ""

    ############################################################
    ## Default db KMA
    ############################################################
    kma_dbs = []
    if not options.only_kma_db:  ## exclusive
        #kma_dbs = ["bacteria", "plasmids"]
        kma_dbs = ["bacteria"]

    if (options.kma_dbs):
        options.kma_dbs = options.kma_dbs + kma_dbs
        options.kma_dbs = set(options.kma_dbs)
    else:
        options.kma_dbs = kma_dbs

    ## raise error & exit if no dbs provided
    if not (options.kma_dbs):
        print(
            colored("***ERROR: No database provided via --kma_db option.\n",
                    'red'))
        exit()

    ############################################################
    ### Options:

    ############
    ## 1) only user data: previously identified and added
    ############
    if (options.only_user_data):
        option_db = "user_data"

    ############
    ## 2) only genbank data: previously download from NCBI reference genomes
    ############
    elif (options.only_genbank_data):
        option_db = "genbank"

    ############
    ## 3) only external kma
    ############
    elif (options.only_external_kma):
        option_db = get_external_kma(options.kma_external_files, Debug)
        ## raise a warning
        if (options.kma_dbs):
            print(
                colored(
                    "***ATTENTION:\nDefault databases and databases provided via the --kma_dbs option will not be used, as --only_external_kma was provided.\n",
                    'red'))

    #################
    ## all databases
    #################
    else:
        ####################
        ## default KMA dbs
        ####################
        print('\t- Selecting kma databases:')
        kma_dbs_string = ','.join(options.kma_dbs)
        option_db = "kma:" + kma_dbs_string

        for i in options.kma_dbs:
            print(colored('\t\t+ %s' % i, 'green'))

        #################
        ## External file
        #################
        if (options.kma_external_files):
            option_db_tmp = get_external_kma(options.kma_external_files, Debug)
            option_db = option_db + '#' + option_db_tmp

        #############################
        ## Previously identified data
        #############################
        if any([options.user_data, options.all_data]):
            option_db = option_db + '#kma_user_data:user_data'

        #############################
        ## Genbank reference data
        #############################
        if any([options.genbank_data, options.all_data]):
            option_db = option_db + '#kma_NCBI:genbank'

    ###############
    ### PubMLST ###
    ###############
    print("\n\t - Select MLST profiles")
    option_db_PubMLST = 'MLST:PubMLST'
    print(
        colored("\t\t + Default MLST profile under database provided: PubMLST",
                'green'))

    if options.MLST_profile:
        ## user provides a PubMLST profile
        options.MLST_profile = os.path.abspath(options.MLST_profile)
        option_db_PubMLST = option_db_PubMLST + '#MLST:' + options.MLST_profile
        print(
            colored(
                "\t\t + User provided MLST profile: %s" % options.MLST_profile,
                'green'))

    ###############
    ### get dbs
    ###############
    print("\n+ Parsing information to retrieve databases")
    print("+ Reading from database: " + database2use)
    HCGB_aes.print_sepLine("-", 50, False)

    ###############
    ## debug message
    if (Debug):
        print(colored("**DEBUG: option_db: " + option_db + " **", 'yellow'))
        print(
            colored(
                "**DEBUG: option_db_PubMLST : " + option_db_PubMLST + " **",
                'yellow'))

    pd_KMA = database_generator.getdbs("KMA", database2use, option_db, Debug)
    pd_PubMLST = database_generator.getdbs("MLST", database2use,
                                           option_db_PubMLST, Debug)

    HCGB_aes.print_sepLine("-", 50, False)

    ## return both dataframes
    pd_Merge = pd.concat([pd_KMA, pd_PubMLST], sort=True, ignore_index=True)
    return (pd_Merge)
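
## With the defaults plus user and genbank data, the strings composed above end
## up in the '#'-separated format that database_generator.getdbs() parses, e.g.
## (illustrative values):
#
# option_db = "kma:bacteria,plasmids#kma_user_data:user_data#kma_NCBI:genbank"
# option_db_PubMLST = "MLST:PubMLST#MLST:/abs/path/to/profile"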
Example no. 7
def run_database(options):

    ## init time
    start_time_total = time.time()
    start_time_partial = start_time_total

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
        print("[Debug mode: ON]")
    else:
        Debug = False

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Database")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    kma_bin = set_config.get_exe("kma")

    ######################################################
    ## print further information if requested
    if (options.help_ARIBA):
        print("ARIBA databases information:")
        ariba_caller.help_ARIBA()
        exit()

    elif (options.help_BUSCO):
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_KMA):
        species_identification_KMA.help_kma_database()
        exit()
    ######################################################

    ## create folder
    ## absolute
    options.path = os.path.abspath(options.path)
    HCGB_files.create_folder(options.path)

    #########
    if Debug:
        print(colored("DEBUG: absolute path folder: " + options.path,
                      'yellow'))

    ##########
    ## NCBI ##
    ##########
    ## if any NCBI options provided
    if any([options.ID_file, options.descendant]):
        ## create folders
        NCBI_folder = HCGB_files.create_subfolder('NCBI', options.path)
        if (options.ID_file):
            ## get path and check if it is file
            abs_path_file = os.path.abspath(options.ID_file)
            if os.path.isfile(abs_path_file):
                print()
                HCGB_aes.print_sepLine("*", 50, False)
                print("--------- Check NCBI ids provided ---------\n")
                HCGB_aes.print_sepLine("*", 70, False)
                ## get file information
                print("\t+ Obtaining information from file: %s" %
                      abs_path_file)
                strains2get = HCGB_main.get_data(abs_path_file, ',', '')
                dataBase_NCBI = database_generator.NCBI_DB(
                    strains2get, NCBI_folder, Debug)

                #########
                if Debug:
                    print(colored("DEBUG: NCBI data provided: ", 'yellow'))
                    print(options.ID_file)

                ## functions.timestamp
                start_time_partial = HCGB_time.timestamp(start_time_partial)
                ## strains downloaded would be included to a kma index

        ## Get all entries belonging to this taxon provided
        if (options.descendant):
            #########
            if Debug:
                print(colored("DEBUG: NCBI descendant option: ON ", 'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print(
                "--------- Check descendant NCBI taxonomy ids provided ---------\n"
            )
            HCGB_aes.print_sepLine("*", 70, False)
            ## [TODO]
            dataBase_NCBI = database_generator.NCBI_descendant(
                options.descendant, NCBI_folder, Debug)

        ##############################################################
        ## update KMA database with NCBI information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_NCBI['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        genbank_kma_db = HCGB_files.create_subfolder('genbank', kma_db)

        print('+ Database to update: ', genbank_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'genbank_KMA',
                                               genbank_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ###############
    ## user_data ##
    ###############
    if options.project_folder:

        ##
        dataBase_user = pd.DataFrame()
        ## get absolute path
        abs_project_folder = os.path.abspath(options.project_folder)
        if os.path.exists(abs_project_folder):
            #########
            if Debug:
                print(
                    colored("DEBUG: User provides folder containing project",
                            'yellow'))

            print()
            HCGB_aes.print_sepLine("*", 70, False)
            print("--------- Check user provided project folder ---------")
            HCGB_aes.print_sepLine("*", 70, False)
            dataBase_user = database_user.update_database_user_data(
                options.path, abs_project_folder, Debug, options)
        else:
            print(
                colored(
                    "ERROR: Folder provided does not exists: %s" %
                    options.project_folder, 'red'))
            exit()

        ##############################################################
        ## update KMA database with user_data information retrieved
        ##############################################################
        print('\n\n+ Update database for later identification analysis...')
        list_of_files = dataBase_user['genome'].tolist()
        kma_db = HCGB_files.create_subfolder('KMA_db', options.path)
        user_kma_db = HCGB_files.create_subfolder('user_data', kma_db)

        print('+ Database to update: ', user_kma_db)
        species_identification_KMA.generate_db(list_of_files, 'userData_KMA',
                                               user_kma_db, 'new', 'batch',
                                               Debug, kma_bin)

        ## time stamp
        start_time_partial = HCGB_time.timestamp(start_time_total)

    ##########
    ## ARIBA
    ##########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check ARIBA parameters provided --------")
    HCGB_aes.print_sepLine("*", 50, False)
    if (options.no_ARIBA):
        print("+ No ARIBA databases would be downloaded...")

        #########
        if Debug:
            print(colored("DEBUG: No option ARIBA", 'yellow'))

    else:
        #functions.print_sepLine("*",50, False)

        ### ariba list databases
        ariba_dbs_list = ['CARD', 'VFDB']

        if (options.no_def_ARIBA):
            ariba_dbs_list = options.ariba_dbs
        else:
            if (options.ariba_dbs):
                ariba_dbs_list = ariba_dbs_list + options.ariba_dbs
                ariba_dbs_list = set(ariba_dbs_list)

        #########
        if Debug:
            print(colored("DEBUG: Option ARIBA", 'yellow'))
            print(options.ariba_dbs)

        ariba_caller.download_ariba_databases(ariba_dbs_list, options.path,
                                              Debug, options.threads)

        ### ariba list databases
        if (options.ariba_users_fasta):
            print(
                "+ Generate ARIBA database for databases provided: prepare fasta and metadata information"
            )

            #########
            if Debug:
                print(colored("DEBUG: Option user ARIBA db", 'yellow'))
                print(options.ariba_users_fasta)
                print(options.ariba_users_meta)

            ## [TODO]:
            ## ariba prepareref fasta and metadata

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    #########
    ## kma ##
    #########
    print()
    HCGB_aes.print_sepLine("*", 50, False)
    print("--------- Check KMA parameters provided ----------")
    kma_database = options.path + '/KMA_db'
    HCGB_files.create_folder(kma_database)

    ## types: bacteria, archaea, protozoa, fungi, plasmids, typestrains
    ## downloads all "bacterial" genomes from KMA website
    ## kma: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder/version/

    print(
        "+ Retrieving information from: ftp://ftp.cbs.dtu.dk/public/CGE/databases/KmerFinder website"
    )

    ## KMA databases to use
    ## only user dbs
    if (options.no_def_kma):
        if (options.kma_dbs):
            print("+ Only user databases selected will be indexed...")
        else:
            print("+ No databases selected.")
            print(colored("ERROR: Please select a kma database.", 'red'))
            exit()

    ## default dbs + user
    else:
        kma_dbs = ["bacteria", "plasmids"]

        ## default dbs + user
        if (options.kma_dbs):
            options.kma_dbs = options.kma_dbs + kma_dbs
            options.kma_dbs = set(options.kma_dbs)
        else:
            options.kma_dbs = kma_dbs

    #########
    if Debug:
        print(colored("DEBUG: options.kma_dbs", 'yellow'))
        print(options.kma_dbs)

    ## Get databases
    for db in options.kma_dbs:
        print(colored("\n+ " + db, 'yellow'))
        db_folder = HCGB_files.create_subfolder(db, kma_database)
        species_identification_KMA.download_kma_database(db_folder, db, Debug)

    ### timestamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ###########
    ## BUSCO ##
    ###########
    if (options.BUSCO_dbs):
        print()
        HCGB_aes.print_sepLine("*", 50, False)
        print("--------- Check BUSCO datasets provided ---------")
        BUSCO_folder = HCGB_files.create_subfolder("BUSCO", options.path)

        #########
        if Debug:
            print(colored("DEBUG: options.BUSCO_dbs", 'yellow'))
            print(options.BUSCO_dbs)

        print("+ BUSCO datasets would be downloaded when executed...")
        #BUSCO_caller.BUSCO_retrieve_sets(options.BUSCO_dbs, BUSCO_folder)

        ### timestamp
        start_time_partial = HCGB_time.timestamp(start_time_partial)

    print("\n*************** Finish *******************\n")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Database module.\n")
    return ()
Example no. 8
def get_userData_info(options, project_folder):
    ## get information regarding:
    ## genus, species (ident module)
    ## card & VFDB (profile module)
    ## additional information: MGE, etc

    ## get profile information
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve virulence/resistance profile information:")
    pd_samples_profile = sampleParser.files.get_files(options, project_folder,
                                                      "profile", ["csv"],
                                                      options.debug)
    if not pd_samples_profile.empty:
        pd_samples_profile = pd_samples_profile.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')

    ## get identification information
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve species identification information:")
    pd_samples_ident = sampleParser.files.get_files(options, project_folder,
                                                    "ident", ["csv"],
                                                    options.debug)
    if not pd_samples_ident.empty:
        pd_samples_ident = pd_samples_ident.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')

    ## get mash information
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve cluster information:")
    pd_samples_mash = sampleParser.files.get_files(options, project_folder,
                                                   "mash", ["sig"],
                                                   options.debug)
    if not pd_samples_mash.empty:
        pd_samples_mash = pd_samples_mash.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print()

    ## add other if necessary

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_samples_profile **", 'yellow'))
        print(pd_samples_profile)
        print(colored("**DEBUG: pd_samples_ident **", 'yellow'))
        print(pd_samples_ident)
        print(colored("**DEBUG: pd_samples_mash **", 'yellow'))
        print(pd_samples_mash)

    ## merge
    df = pd.concat([pd_samples_profile, pd_samples_ident, pd_samples_mash],
                   join='inner',
                   sort=True).drop_duplicates()
    ## joining with 'inner', we keep only the columns common to all frames

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat **", 'yellow'))
        print(df)

    ## set new column with name of samples
    df = df.reset_index()

    ## rename column
    df.rename(columns={'index': 'name'}, inplace=True)

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat reset_index**", 'yellow'))
        print(df)

    ##
    return (df)
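
## The join='inner' concat above keeps only the columns shared by all frames.
## A self-contained toy example of that behaviour (hypothetical data):
import pandas as pd

_a = pd.DataFrame({'name': ['s1'], 'path': ['profile.csv'], 'score': [0.9]})
_b = pd.DataFrame({'name': ['s1'], 'path': ['ident.csv']})
_common = pd.concat([_a.set_index('name'), _b.set_index('name')], join='inner')
## -> rows are stacked; 'score' is dropped since it is not present in both frames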
Example no. 9
def print_all():

    HCGB_aes.print_sepLine("+", 50, False)
    print('Python:')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Python version:', str(sys.version))
    print('\n')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Python packages:')
    extern_progs.print_package_version()
    HCGB_aes.print_sepLine("+", 50, False)
    print('\n')

    HCGB_aes.print_sepLine("+", 50, False)
    print('External dependencies:\n')
    HCGB_aes.print_sepLine("+", 50, False)
    extern_progs.print_dependencies()
    print('\n')

    print('Additional dependencies: databases, information, etc...')

    HCGB_aes.print_sepLine("*", 50, False)
    print("ARIBA databases version..")
    HCGB_aes.print_sepLine("*", 50, False)
    print("card -> ")
    print("megares  ->")
    print("plasmidfinder  ->")
    print("resfinder  ->")
    print("srst2_argannot  ->")
    print("vfdb_core & vfdb_full  ->")
    print("virulencefinder  ->")
    print('\n')
Example no. 10
def help_fastq_format():
    """
    Explanation of fastq format details.
    """

    HCGB_aes.boxymcboxface("Name format for samples")

    print("Format for fastq files can be:")
    print(
        "name.fastq.gz, name_1.fastq.gz, name_R2.fastq.gz, name_L001_R1.fastq.gz, name_L001_R1_001.fastq.gz etc."
    )
    print(
        "\nThere are many options and here we provide some guidelines on the name format."
    )
    print("\n")

    HCGB_aes.print_sepLine("*", 20, "red")
    print("[1] Length limitation")
    HCGB_aes.print_sepLine("*", 20, "red")
    print("There is a limitation for the sample ID ('name') of 25 characters.")
    print(
        colored(
            "** BacterialTyper provides an option to rename samples if necessary: module prep option --rename **",
            'yellow'))
    print("\n")

    HCGB_aes.print_sepLine("*", 20, "red")
    print("[2] Single end files")
    HCGB_aes.print_sepLine("*", 20, "red")
    print(
        colored(
            '** Use option --single-end in the different BacterialTyper modules **',
            'yellow'))
    print("name.fastq.gz")
    print("name.fastq")
    print("name.fq")
    print("\n")

    HCGB_aes.print_sepLine("*", 20, "red")
    print("[3] Paired-end files")
    HCGB_aes.print_sepLine("*", 20, "red")
    print(
        "Paired-end files are fully supported. The format for these files is:")
    print("Read1 => name_1.fastq.gz or name_R1.fastq.gz")
    print("Read2 => name_2.fastq.gz or name_R2.fastq.gz")
    print(
        colored('** See additional details for Lane information **', 'yellow'))
    print("\n")

    HCGB_aes.print_sepLine("*", 55, "red")
    print("[4] Lane information:")
    HCGB_aes.print_sepLine("*", 55, "red")
    print(
        "In some cases, files might contain lane information (*L00x* and/or *00x*)."
    )
    print(
        "BacterialTyper supports these names as long as they follow these examples:"
    )
    print("name_L00x_R1.fastq.gz\tname_L00x_R2.fastq.gz")
    print("name_L00x_1.fastq.gz\tname_L00x_2.fastq.gz")
    print("name_L00x_R1.fastq.gz\tname_L00x_R2.fastq.gz")
    print("name_L00x_R1_00x.fastq.gz\tname_L00x_R2_00x.fastq.gz")
    print("\n")
    print(
        "Sometimes it might be appropriate to include lane tags (*L00X*) within the name."
    )
    print(colored("** Use option --include-lane within each module", 'yellow'))

    print(
        colored(
            "\n** If you need to merge fastq files from different lanes, use option within module prep **",
            'yellow'))
    print("As an example:")
    print(colored("\n** Option --merge within module prep **", 'yellow'))
    print("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
    print("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
    print("sample1_L003_R1.fastq.gz\tsample1_L003_R2.fastq.gz")
    print("sample1_L004_R1.fastq.gz\tsample1_L004_R2.fastq.gz")
    print("Result:")
    print("--------------------------------------------------")
    print("sample1_R1.fastq.gz\tsample1_R2.fastq.gz")
    print("\n")
    print(
        colored("\n** Option --merge-by-lane within module prep **", 'yellow'))
    print("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
    print("sample1_L001_R1_002.fastq.gz\tsample1_L001_R2_002.fastq.gz")
    print("sample1_L002_R1_001.fastq.gz\tsample1_L002_R2_001.fastq.gz")
    print("sample1_L002_R1_002.fastq.gz\tsample1_L002_R2_002.fastq.gz")
    print("--------------------------------------------------")
    print("Result:")
    print("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
    print("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
    print(
        colored("** Remember to use option --include_lane within each module",
                'yellow'))
    print("\n")

    HCGB_aes.print_sepLine("*", 55, "red")
    print("[5] Include all information:")
    HCGB_aes.print_sepLine("*", 55, "red")
    print(
        "In some cases, files might contain other information and it is necessary to "
        + "include it all as a tag name. See an example:")
    print("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
    print(
        colored("** Remember to use option --include_all within each module",
                'yellow'))
    print(
        colored(
            "** It might be appropriate to change sample names using the --rename option in the prep module",
            'yellow'))

    print("\n")
    HCGB_aes.print_sepLine("*", 15, "red")
    print("[6] Extensions:")
    HCGB_aes.print_sepLine("*", 15, "red")
    print(
        "name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz"
    )
    print("\n")
Example no. 11
def print_all():
    print("")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("\tSOFTWARE")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print(
        "Third-party software included or employed during the pipeline workflow."
    )
    print("")
    df_software_citation = pd.DataFrame.from_dict(
        software_citation(),
        orient='index',
        columns=('Article Title', 'Authors', 'PUBMED ID', 'Website'))
    df_software_citation.index.names = ['Software']
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_columns', None)
    print(df_software_citation)
    print("")

    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("\tDATABASES")
    HCGB_aes.print_sepLine("+", 50, 'yellow')
    print("")
    print("Please cite according to your selection.")
    print("")

    HCGB_aes.print_sepLine("+", 50, False)
    print("\tARIBA databases")
    HCGB_aes.print_sepLine("*", 50, False)
    df_ARIBA_DB_citation = pd.DataFrame.from_dict(
        ariba_citation(),
        orient='index',
        columns=('Article Title', 'Authors', 'PUBMED ID', 'Website'))
    df_ARIBA_DB_citation.index.names = ['Databases']
    print(df_ARIBA_DB_citation)
    print("\n")

    HCGB_aes.print_sepLine("*", 50, False)
    print("\tKMA software & databases")
    HCGB_aes.print_sepLine("*", 50, False)

    print()
    print()

    HCGB_aes.print_sepLine("*", 50, False)
    print("\tBUSCO software & dataset")
    HCGB_aes.print_sepLine("*", 50, False)
    print(
        "BUSCO applications from quality assessments to gene prediction and phylogenomics."
    )
    print(
        "Robert M. Waterhouse, Mathieu Seppey, Felipe A. Simão, Mosè Manni, Panagiotis "
    )
    print(
        "Ioannidis, Guennadi Klioutchnikov, Evgenia V. Kriventseva, and Evgeny M. Zdobnov"
    )
    print(
        "Mol Biol Evol, published online Dec 6, 2017, doi: 10.1093/molbev/msx319"
    )
    print()
    print(
        "BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs."
    )
    print(
        "Felipe A. Simão, Robert M. Waterhouse, Panagiotis Ioannidis, Evgenia "
    )
    print("V. Kriventseva, and Evgeny M. Zdobnov")
    print(
        "Bioinformatics, published online June 9, 2015, doi: 10.1093/bioinformatics/btv351"
    )
    print()
    print(
        "For further details, please visit: https://busco.ezlab.org/ or https://www.orthodb.org/"
    )

    print()
    print()
Example n. 12
def run(options):
    """
	This is the main function of the module ``config``. It checks
	whether the different requirements (python packages and third-party software) are
	fulfilled.

	If any requirement is not available, this module tries to install it or reports to the user to
	install it manually.

	:param option: State whether to check or install missing modules, packages and third-party software. Provide: check/install
	:param install_path: Absolute path to install missing modules or packages. Default: ``BacterialTyper`` environment path.
	:param IslandPath: True/False for checking additional perl packages and software required by this optional analysis.
	:param debug: True/False for debugging messages.
	
	:type option: string 
	:type IslandPath: boolean
	:type install_path: string 
	:type debug: boolean	

	.. seealso:: This function depends on several ``BacterialTyper`` functions:

		- :func:`BacterialTyper.config.set_config.check_python_packages`

		- :func:`BacterialTyper.config.set_config.check_perl_packages`

		- :func:`BacterialTyper.config.extern_progs.return_min_version_soft`

		- :func:`BacterialTyper.config.extern_progs.print_dependencies`

	"""

    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Pipeline Configuration")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    if (options.install_path):
        if os.path.isdir(options.install_path):
            if (Debug):
                print(
                    "Installation path provided for missing modules, packages, dependencies..."
                )
                print("Path: " + options.install_path)
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()
    else:
        ## get python environment path
        env_bin_directory = os.path.dirname(os.environ['_'])

        ##os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'templates'))
        options.install_path = os.path.abspath(
            os.path.join(env_bin_directory, '../software'))

        if (Debug):
            print("Retrieve environment path as installation path:")
            print("Path: " + options.install_path)

        HCGB_files.create_folder(options.install_path)

    #######################
    ## install or only check
    #######################
    option_install = False
    if (options.option == 'install'):
        print("\n+ Check dependencies")
        print(
            "+ Try to install all missing dependencies, modules or third party software..."
        )
        option_install = True

        ## check if access and permission
        if os.path.isdir(options.install_path):
            if (set_config.access_check(options.install_path, mode=os.F_OK)):
                print(
                    "Installation path is accessible and has permission for installation if necessary"
                )
            else:
                print(colored("\n*** ERROR ****", 'red'))
                print(
                    colored(
                        "No access/permission for this path: %s" %
                        options.install_path, 'red'))
                print(
                    colored(
                        "Please provide a valid path with access/permission to install any missing dependencies.",
                        'red'))
                exit()
        else:
            print(colored("\n*** ERROR ****", 'red'))
            print(colored("Path provided is not a folder", 'red'))
            print(options.install_path)
            exit()

    elif (options.option == 'only_check'):
        print(
            "\nCheck dependencies, modules or third party software and print report..."
        )

    #######################
    ## python version
    #######################
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python:')
    HCGB_aes.print_sepLine("+", 20, False)

    this_python_version = str(sys.version)
    python_min_version = extern_progs.return_min_version_soft('python')
    if LooseVersion(this_python_version) >= LooseVersion(python_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (python_min_version, this_python_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (python_min_version, this_python_version), 'red'))
        exit()

    #######################
    ## perl_version
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 50, False)
    print('Perl:')
    HCGB_aes.print_sepLine("+", 50, False)

    perl_min_version = extern_progs.return_min_version_soft('perl')
    this_perl_path = set_config.get_exe("perl", Debug)
    this_perl_version = set_config.get_version("perl", this_perl_path, Debug)
    if LooseVersion(this_perl_version) >= LooseVersion(perl_min_version):
        print(
            colored(
                "Minimum version (%s) satisfied: %s" %
                (perl_min_version, this_perl_version), 'green'))
    else:
        print(
            colored(
                "Minimum version (%s) not satisfied: %s" %
                (perl_min_version, this_perl_version), 'red'))
        exit()

    #######################
    ## third-party software
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('External dependencies:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_dependencies(option_install, options.install_path, Debug)
    print('\n')

    #######################
    ## python packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Python packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_python_packages(Debug, option_install,
                                     options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## perl packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('Perl packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_perl_packages("perl_dependencies", Debug, option_install,
                                   options.install_path)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')

    #######################
    ## IslandPath dependencies
    #######################
    if (options.IslandPath):
        print('\n')
        HCGB_aes.print_sepLine("+", 20, False)
        print('IslandPath packages and software required:')
        HCGB_aes.print_sepLine("+", 20, False)

        set_config.check_IslandPath(Debug, option_install,
                                    options.install_path)
        HCGB_aes.print_sepLine("+", 20, False)
        print('\n')

    #######################
    ## R packages
    #######################
    print('\n')
    HCGB_aes.print_sepLine("+", 20, False)
    print('R packages:')
    HCGB_aes.print_sepLine("+", 20, False)

    set_config.check_R_packages(option_install, options.install_path, Debug)
    HCGB_aes.print_sepLine("+", 20, False)
    print('\n')
Example n. 13
def update_database_user_data(database_folder, project_folder, Debug, options):
    """
	Updates user_data folder within the database folder provided.
	
	It generates a single subfolder for each previously analyzed sample and stores the main information and result files for later interpretation, comparison and/or summarization with newly analyzed samples.
	
	:param database_folder: Absolute path to the database folder.
	:param project_folder: Absolute path to the project folder containing previously analyzed samples.
	:param Debug: True/False for debugging messages.
	:param options: Options object containing the arguments provided to the module.
	
	:type database_folder: string
	:type project_folder: string
	:type Debug: bool
	:type options: object
	
	:returns: Updated database result from :func:`BacterialTyper.scripts.database_generator.update_db_data_file`.
	:rtype: Dataframe
	
	:warnings: Returns **FAIL** if the check process fails.
	
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.files_functions.create_subfolder`
		
		- :func:`HCGB.functions.main_functions.get_data`
		
		- :func:`HCGB.functions.main_functions.optimize_threads`
		
		- :func:`BacterialTyper.scripts.database_user.get_userData_files`
		
		- :func:`BacterialTyper.scripts.database_user.update_sample`
		
		- :func:`BacterialTyper.scripts.database_generator.getdbs`
		
		- :func:`BacterialTyper.scripts.database_generator.get_database`
		
		- :func:`BacterialTyper.scripts.database_generator.update_db_data_file`

	"""

    print("\n+ Updating information from user data folder: ", project_folder)

    ## create folder
    own_data = HCGB_files.create_subfolder("user_data", database_folder)

    ## Default missing options
    options.project = True
    options.debug = Debug
    if not options.single_end:
        options.pair = True

    ####################################
    ## get information
    ####################################

    ## get user data files
    project_data_df = get_userData_files(options, project_folder)

    ## get user data info
    project_info_df = get_userData_info(options, project_folder)

    ## merge data
    project_all_data = pd.concat([project_data_df, project_info_df],
                                 join='outer',
                                 sort=True).drop_duplicates()
    #project_all_data.index.name = 'name'

    ## debug messages:
    if Debug:
        HCGB_aes.debug_message("project_data_df", 'yellow')
        print(project_data_df)

        HCGB_aes.debug_message("project_info_df", 'yellow')
        print(project_info_df)

        HCGB_aes.debug_message("project_all_data", 'yellow')
        print(project_all_data)

    print('\n+ Get database information')
    db_frame = database_generator.getdbs('user_data', database_folder,
                                         'user_data', Debug)
    user_data_db = database_generator.get_database(db_frame, Debug)

    ## merge dataframe
    sample_frame = project_all_data.groupby("name")

    ####################################
    ## optimize threads
    ####################################
    name_list = project_all_data.index.values.tolist()
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print('\n+ Updating information using %s threads and %s parallel jobs' %
          (options.threads, max_workers_int))

    ####################################
    ## loop through frame using multiple threads
    ####################################
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        ## send for each
        commandsSent = {
            executor.submit(update_sample, name, cluster, own_data,
                            user_data_db, Debug): name
            for name, cluster in sample_frame
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    HCGB_aes.print_sepLine("+", 75, False)
    print("\n+ Retrieve information ...")

    ####################################
    ###### populate dataframe
    ####################################
    for name, cluster in sample_frame:
        ###### dump to file
        info_file = own_data + '/' + name + '/info.txt'
        if os.path.exists(info_file):
            dataGot = HCGB_main.get_data(info_file, ',', 'index_col=0')
            dataGot = dataGot.set_index('ID')

            if (options.debug):
                print(colored("**DEBUG: dataGot dataframe **", 'yellow'))
                print(dataGot)

            user_data_db = pd.concat([user_data_db, dataGot],
                                     join='outer',
                                     sort=True).drop_duplicates()
            ## concatenating by outer we get all available entries

    if (options.debug):
        print(colored("**DEBUG: user_data_db dataframe **", 'yellow'))
        print(user_data_db)

    HCGB_aes.print_sepLine("+", 75, False)

    ####################################
    ## update db
    ####################################
    database_csv = own_data + '/user_database.csv'

    dataUpdated = database_generator.update_db_data_file(
        user_data_db, database_csv)
    print("+ Database has been generated: \n", database_csv)
    return (dataUpdated)
def NCBI_DB(strains2get, data_folder, Debug):
    """Donwloads given taxa from NCBI if not available and updates database information.
	
	This function checks in the given folder whether the strain of interest is available. If not, it connects to NCBI using the python module ncbi_genome_download and downloads the corresponding data.
	
	:param strains2get: dataframe containing genus, species and NCBI assembly columns among others. See example below.
	:param data_folder: Absolute path to database NCBI folder.
	:param Debug: Print messages for debugging purposes if desired. 
	:type strains2get: dataframe
	:type data_folder: string
	:type Debug: bool
	:return: Dataframe of genbank database updated for all available entries.

	Columns for the dataframe :file:`strains2get` consist of:
	
	sample,genus,species,strain,BioSample,genome,Plasmids
 
	See an example in file: :file:`/devel/results/strains2get_NCBI_DB.csv`, shown here:
	
	.. include:: ../../devel/results/strains2get_NCBI_DB.csv
		:literal:
		
	See example of the return dataframe, containing database information updated in file: :file:`/devel/results/genbank_database.csv` here:
	
	.. include:: ../../devel/results/genbank_database.csv
		:literal:
	
	.. seealso:: This function depends on other BacterialTyper functions called:
	
		- :func:`HCGB.functions.files_functions.create_folder`
	
		- :func:`HCGB.functions.main_functions.get_data`
	
		- :func:`BacterialTyper.scripts.database_generator.getdbs`
	
		- :func:`BacterialTyper.scripts.database_generator.get_database`
		
		- :func:`BacterialTyper.scripts.database_generator.NCBIdownload`
		
		- :func:`BacterialTyper.scripts.database_generator.update_db_data_file`
		
	.. include:: ../../links.inc	 	
	
	"""

    ## set index
    strains2get = strains2get.set_index(
        'NCBI_assembly_ID', drop=False)  ## set new index but keep column
    strains2get.index.names = ['ID']  ## rename index
    strains2get = strains2get.drop_duplicates()

    #########
    if Debug:
        print(colored("DEBUG: NCBI data provided: ", 'yellow'))
        print(strains2get)

    ## get data existing database
    print("+ Create the database in folder: \n", data_folder)
    HCGB_files.create_folder(data_folder)

    ## read database
    db_frame = getdbs('NCBI', data_folder, 'genbank', Debug)
    database_df = get_database(db_frame, Debug)

    #########
    if Debug:
        print(colored("DEBUG: NCBI genbank database retrieved: ", 'yellow'))
        print("db_frame")
        print(db_frame)
        print()

        print("database_df")
        print(database_df)

    ## loop and download
    for index, row in strains2get.iterrows():
        HCGB_aes.print_sepLine("+", 75, False)
        acc_ID = index  #strains2get.loc[index]['NCBI_assembly_ID']
        info = "Genus: " + strains2get.loc[index][
            'genus'] + '\n' + "Species: " + strains2get.loc[index][
                'species'] + '\n' + "Strain: " + strains2get.loc[index][
                    'name'] + '\n' + "ID accession: " + acc_ID + '\n'
        dir_path = data_folder + '/genbank/bacteria/' + acc_ID  ## the ngd module requires data to be downloaded into the bacteria subfolder under the genbank folder

        ## check if already exists
        if acc_ID in database_df.index:
            print("\n+ Data is already available in database for: ")
            print(colored(info, 'green'))

        else:
            ## download
            print("\n+ Downloading data for:")
            print(colored(info, 'green'))
            data_accID = NCBIdownload(acc_ID, strains2get, data_folder)
            this_db = HCGB_main.get_data(data_accID, ',', 'index_col=0')
            this_db = this_db.set_index('ID')
            database_df = pd.concat([database_df, this_db])

    ## Generate/Update database
    database_csv = data_folder + '/genbank_database.csv'
    db_updated = update_db_data_file(database_df, database_csv)
    print("+ Database has been generated in file: ", database_csv)
    return (db_updated)
Example n. 15
def parse_options(arg_dict):

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as a mutually_exclusive group. It might be changed to allow multiple options.
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy,
                 organism) = BacDup.scripts.functions.get_gbk_information(
                     file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(
                            arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder(
                "db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* An NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* An NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(item) for item in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get,
         allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
             db_folder,
             string_info_total,
             int(arg_dict.k_random),
             arg_dict.debug,
             assembly_level_given=arg_dict.assembly_level,
             group_given=group_obtained,
             section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'all_entries.txt'),
            allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nIf random selection was used, note that re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## create df_accID to store data
        ## TODO

    ## Returns dataframe with information

    df_accID = df_accID.set_index('new_name')
    return (df_accID)
Example n. 16
def ARIBA_ident(options, pd_samples_retrieved, outdir_dict, retrieve_databases,
                start_time_partial):
    HCGB_aes.boxymcboxface("ARIBA Identification")

    ##################
    ## check status	##
    ##################
    databases2use = []  ## path, db name
    card_trick_info = ""
    print('+ Check databases status: ')
    for index, db2use in retrieve_databases.iterrows():
        ## index_name
        if (db2use['source'] == 'ARIBA'):
            index_status = ariba_caller.check_db_indexed(db2use['path'], 'YES')
            if (index_status == True):
                #print (colored("\t+ Databases %s seems to be fine...\n\n" % db2use['db'], 'green'))
                databases2use.append([db2use['path'], db2use['db']])

                ## prepare card database ontology for later
                if (db2use['db'] == 'card'):
                    card_trick_info = card_trick_caller.prepare_card_data(
                        options.database)

        ## check status of other databases if any
        # else:

    ## debug message
    if (Debug):
        print(colored("**DEBUG: databases2use\n**", 'yellow'))
        print(databases2use)
        if (card_trick_info):
            print(
                colored("**DEBUG: card_trick_info: " + card_trick_info + " **",
                        'yellow'))

    ######################################################
    ## Start identification of samples
    ######################################################
    print("\n+ Send ARIBA identification jobs...")

    ## get outdir folders
    outdir_samples = pd.DataFrame(columns=('sample', 'dirname', 'db',
                                           'output'))

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    for name, cluster in sample_frame:
        for db2use in databases2use:
            tmp = get_outfile(outdir_dict[name], name, db2use[0])
            outdir_samples.loc[len(outdir_samples)] = (name, outdir_dict[name],
                                                       db2use[1], tmp)

    ## multi-index
    outdir_samples = outdir_samples.set_index(['sample', 'db'])

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_samples **", 'yellow'))
        print(outdir_samples)

    ######################################################
    ## send for each sample
    ######################################################
    ## ariba assembly cutoff
    if not (options.ARIBA_cutoff):
        options.ARIBA_cutoff = 0.90

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## loop
    results_df = pd.DataFrame()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        for db2use in databases2use:
            print(colored("+ Working with database: " + db2use[1], 'yellow'))
            ## send for each sample
            commandsSent = {
                executor.submit(
                    ariba_run_caller,
                    db2use[0],
                    db2use[1],  ## database path & dbname
                    sorted(cluster["sample"].tolist()),  ## files
                    outdir_samples.loc[(name, db2use[1]), 'output'],  ## output
                    threads_job,
                    options.ARIBA_cutoff): name
                for name, cluster in sample_frame
            }

            for cmd2 in concurrent.futures.as_completed(commandsSent):
                details = commandsSent[cmd2]
                try:
                    data = cmd2.result()
                except Exception as exc:
                    print('***ERROR:')
                    print(cmd2)
                    print('%r generated an exception: %s' % (details, exc))

            print("+ Jobs finished for database %s ..." % db2use[1])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

            print()
            print(
                "+ Collecting information for each sample analyzed for database: "
                + db2use[1])
            ## check results for each database
            results_df_tmp = virulence_resistance.check_results(
                db2use[1], outdir_samples, options.ARIBA_cutoff,
                card_trick_info)
            results_df = pd.concat([results_df, results_df_tmp])

            ## functions.timestamp
            start_time_partial = HCGB_time.timestamp(start_time_partial)

    ######################################################
    ## Generate final report for all samples
    ######################################################
    ## ariba summary results all samples
    print(
        "\n + Generate a summary file for all samples and one for each database employed..."
    )

    ## parse results
    if Project:
        final_dir = input_dir + '/report/profile'
        HCGB_files.create_folder(final_dir)
    else:
        final_dir = os.path.abspath(options.output_folder)

    ##
    vfdb = False
    subfolder = HCGB_files.create_subfolder("ariba_summary", final_dir)
    ## subfolder_samples = functions.create_subfolder("samples", final_dir) ## TODO: Copy all xlsx files to a common folder. Is it necessary?

    ## open excel writer
    name_excel = final_dir + '/profile_summary.xlsx'
    writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')

    for database, data in outdir_samples.groupby(level='db'):  ## fix
        report_files_databases = {}

        for sample, data2 in data.groupby(level='sample'):  ## fix
            file_report = data2.loc[sample, database]['output'] + '/report.tsv'
            if os.path.isfile(file_report):  ## check if exists
                report_files_databases[sample] = file_report

        outfile_summary = subfolder + "/"
        if database.endswith('card_prepareref/'):
            outfile_summary = outfile_summary + 'CARD_summary'
            name_db = 'CARD'
        elif database.endswith('vfdb_full_prepareref/'):
            outfile_summary = outfile_summary + 'VFDB_summary'
            name_db = 'VFDB'
            vfdb = True
        else:
            ## TODO: check if there are multiple 'other' databases
            ## Different databases provided (different to VFDB and CARD) would collapse file
            outfile_summary = outfile_summary + 'Other_summary'
            name_db = 'other'

        ## call ariba summary to summarize results
        csv_all = ariba_caller.ariba_summary_all(outfile_summary,
                                                 report_files_databases)
        if not csv_all == 'NaN':
            csv2excel = pd.read_csv(csv_all, header=0, sep=',')
            ## write excel
            name_tab = name_db + '_found'
            csv2excel.to_excel(writer, sheet_name=name_tab)

    ## results_df contains excel and csv files for each sample and for each database
    list_databases = set(results_df['database'].to_list())
    for db in list_databases:
        df_db = results_df[results_df['database'] == db]['csv']
        dict_samples = df_db.to_dict()

        merge_df = pd.DataFrame()
        for sample in dict_samples:

            if os.path.isfile(dict_samples[sample]):
                df = pd.read_csv(dict_samples[sample], header=0, sep=",")
                df = df.set_index('Genes')
                df.rename(columns={'Status': sample}, inplace=True)
                df2 = df[[sample]]

                ## add to a common dataframe
                merge_df = pd.concat([merge_df, df2], axis=1, sort=True)
                merge_df.fillna("NaN", inplace=True)

        trans_df = merge_df.transpose()
        ## write excel
        name_tab = db + '_all'
        trans_df.to_excel(writer, sheet_name=name_tab)

    ## close
    writer.save()

    ######################################################
    ## print additional information for VFDB
    ######################################################
    if (vfdb):
        print("\n\n")
        HCGB_aes.print_sepLine("*", 50, False)
        print("+ Check VFDB details in files downloaded from vfdb website:")
        files_VFDB = virulence_resistance.check_VFDB(final_dir +
                                                     '/VFDB_information')
        HCGB_aes.print_sepLine("*", 50, False)

    ######################################################
    print("\n+ Please check additional summary files generated at folder ",
          final_dir)
    print("+ Go to website: https://jameshadfield.github.io/phandango/#/")
    print(
        "+ For each database upload files *phandango.csv and *phandango.tre and visualize results"
    )
Example n. 17
def get_userData_files(options, project_folder):
    ## get information regarding files

    ## get trimmed ngs files
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve trimmed reads information:")
    pd_samples_reads = sampleParser.files.get_files(options, project_folder,
                                                    "trim", ['_trim'],
                                                    options.debug)
    pd_samples_reads = pd_samples_reads.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')

    ## get assembly files
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve assembly information:")
    pd_samples_assembly = sampleParser.files.get_files(options, project_folder,
                                                       "assembly", ["fna"],
                                                       options.debug)
    pd_samples_assembly = pd_samples_assembly.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')

    ## get annotation files
    print()
    HCGB_aes.print_sepLine("-", 60, 'yellow')
    print("+ Retrieve annotation information:")
    pd_samples_annot = sampleParser.files.get_files(options, project_folder,
                                                    "annot",
                                                    ['gbf', 'faa', 'gff'],
                                                    options.debug)
    pd_samples_annot = pd_samples_annot.set_index('name')
    HCGB_aes.print_sepLine("-", 60, 'yellow')

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_samples_reads **", 'yellow'))
        print(pd_samples_reads)
        print(colored("**DEBUG: pd_samples_assembly **", 'yellow'))
        print(pd_samples_assembly)
        print(colored("**DEBUG: pd_samples_annot **", 'yellow'))
        print(pd_samples_annot)

    ## merge
    df = pd.concat([pd_samples_reads, pd_samples_annot, pd_samples_assembly],
                   sort=True,
                   join='inner').drop_duplicates()
    ## joining by inner we only get common columns among all

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat **", 'yellow'))
        print(df)

    ## set new column with name of samples
    df = df.reset_index()

    ## debug message
    if (options.debug):
        print(colored("**DEBUG: pd_concat reset_index**", 'yellow'))
        print(df)
    ##
    return (df)