Code example #1
0
def BUSCO_check_dataset(folder, name):
	"""Check the integrity of a downloaded BUSCO dataset folder.

	Reads ``dataset.cfg`` inside *folder* and prints its key/value
	statistics. If the config file is missing, the dataset is considered
	corrupt: the folder is deleted and 'FAIL' is returned.

	:param folder: absolute path to the BUSCO dataset folder.
	:param name: dataset name (kept for interface compatibility; not used here).

	:returns: 'FAIL' when the dataset is incomplete and was removed;
		``None`` otherwise (including when *folder* does not exist).
	"""
	config_file = folder + '/dataset.cfg'
	## Example dataset.cfg contents:
	##	name=bacteria_odb9
	##	species=E_coli_K12
	##	domain=prokaryota
	##	creation_date=2016-11-01
	##	number_of_BUSCOs=148
	##	number_of_species=3663

	if os.path.isdir(folder):
		#print ("+ Checking the integrity of BUSCO dataset in folder: ", folder)
		HCGB_aes.print_sepLine("+", 10, False)
		print ("Statistics for dataset: ")
		HCGB_aes.print_sepLine("+", 10, False)
		if os.path.isfile(config_file):
			## print each 'key=value' entry as 'key <TAB> value'
			list_config = HCGB_main.readList_fromFile(config_file)
			for elem in list_config:
				line = elem.split("=")
				line[0] = line[0].replace("_", " ")
				print (" "+ "\t".join(line))

			print()
			print ("Available in folder: ", folder)
			print (colored("Dataset....[ OK ]\n", 'green'))
		else:
			print (colored("Dataset....[ FAIL ]\n", 'red'))
			print ("+ Removing dataset to avoid further errors:")
			## FIX: os.rmdir() only removes *empty* directories and raises
			## OSError on a partially-downloaded dataset folder that still
			## contains files; remove the whole tree instead.
			import shutil
			shutil.rmtree(folder)
			return ('FAIL')
Code example #2
0
def R_package_path_installed():
    """Provide the absolute path where missing R packages are installed.

    Reads ``R/R_package.info.txt`` located next to this module. If the file
    exists and is non-empty, its first line (the previously recorded install
    path) is returned. Otherwise an ``R/install_packages`` folder is created
    beside this module and its path is returned so packages can be installed
    there.

    :returns: absolute path to the R package installation folder (string).
    """

    ## file recording the install path, relative to this module's location
    RDir_package = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                'R', 'R_package.info.txt')

    if HCGB_files.is_non_zero_file(RDir_package):
        ## FIX: renamed local from 'list', which shadowed the builtin
        info_lines = HCGB_main.readList_fromFile(RDir_package)
        return (info_lines[0])
    else:
        path2install = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'R',
            'install_packages')
        HCGB_files.create_folder(path2install)
        return (path2install)
Code example #3
0
def dead_code():
	"""Convert ARIBA prepareref rename info into a two-column mapping file.

	Reads ``00.rename_info`` under the (module-level) ``card_prepareref``
	path and writes ``00.info_dictionary`` with the stem of the second
	tab-field followed by the stem of the first, one pair per line.

	NOTE(review): depends on a name ``card_prepareref`` defined elsewhere
	in the module; flagged as unused by its own name.

	:returns: path to the generated ``00.info_dictionary`` file.
	"""
	rename_file = card_prepareref + '00.rename_info'
	dict_file = card_prepareref + '00.info_dictionary'

	entries = HCGB_main.readList_fromFile(rename_file)
	with open(dict_file, 'w') as dict_handle:
		for entry in entries:
			fields = entry.split('\t')
			## keep only the part before the first '.' of each field
			dict_handle.write(fields[1].split('.')[0] + '\t' +
			                  fields[0].split('.')[0] + '\n')

	return (dict_file)
Code example #4
0
File: cluster.py  Project: HCGB-IGTP/BacterialTyper
def run_cluster(options):
    """Cluster project samples using min-hash (mash) signatures.

    Retrieves reads or assemblies for every sample, generates (or reuses)
    one sketch signature per sample, merges them with the signatures from
    the user/genbank/external databases requested, then compares all
    signatures and writes a clustering plot and a Newick tree under the
    report folder.

    :param options: argparse-like namespace with cluster-mode options
        (input/output folders, kmer_size, n_sketch, database flags, ...).

    :returns: empty tuple.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_Mash):
        ## information for Min Hash Software
        min_hash_caller.helpMash()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Clustering samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    if options.reads:
        if options.noTrim:
            ## raw reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "fastq",
                ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
        else:
            ## trimm reads
            pd_samples_retrieved = sampleParser.files.get_files(
                options, input_dir, "trim", ['_trim'], options.debug)

        ## keep only R1 reads if paired-end
        if options.pair:
            pd_samples_retrieved = pd_samples_retrieved.loc[
                pd_samples_retrieved['read_pair'] == "R1"]

    else:
        ## default
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    # exit if empty
    if pd_samples_retrieved.empty:
        print(
            "No data has been retrieved from the project folder provided. Exiting now..."
        )
        exit()

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for each sample
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "mash",
                                            options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: outdir_dict **", 'yellow'))
        print(outdir_dict)

    ## get databases to check
    retrieve_databases = get_options_db(options)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## remove samples if specified
    if options.ex_sample:
        ex_samples = HCGB_main.get_info_file(options.ex_sample)
        retrieve_databases = retrieve_databases.loc[~retrieve_databases.index.
                                                    isin(ex_samples)]

    ## debug message
    if (Debug):
        print(colored("**DEBUG: retrieve_database **", 'yellow'))
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_columns', None)
        print(retrieve_databases)

    ## check if all samples in user_data or genbank are indexed
    siglist_all = []
    for index, row in retrieve_databases.iterrows():
        if not row['path'] == 'NaN':
            if (Debug):
                HCGB_aes.print_sepLine("*", 25, False)
                print(row)

            ## reuse a stored signature only if it was sketched with the
            ## same k-mer size and number of sketches requested now
            if all([
                    int(options.kmer_size) == int(row['ksize']),
                    int(options.n_sketch) == int(row['num_sketch'])
            ]):
                siglist_all.append(
                    min_hash_caller.read_signature(row['path'],
                                                   options.kmer_size))
                continue

        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(row['folder'], row['original'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        ## FIX: use .loc[index, col]; the original chained indexing
        ## (.loc[index]['path'] = ...) assigns into a temporary copy and
        ## silently leaves the dataframe unchanged (pandas SettingWithCopy)
        retrieve_databases.loc[index, 'path'] = sigfile
        retrieve_databases.loc[index, 'ksize'] = options.kmer_size
        retrieve_databases.loc[index, 'num_sketch'] = options.n_sketch
        siglist_all.append(siglist)

    ### Cluster project samples
    print(colored("\n+ Collect project data", 'green'))
    print("+ Generate mash sketches for each sample analyzed...")
    pd_samples_retrieved = pd_samples_retrieved.set_index('name')

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieved **", 'yellow'))
        print(pd_samples_retrieved)

    ## init dataframe for project data
    colname = ["source", "name", "path", "original", "ksize", "num_sketch"]
    pd_samples_sketched = pd.DataFrame(columns=colname)
    for index, row in pd_samples_retrieved.iterrows():
        if index in retrieve_databases.index:
            print(
                colored(
                    '\t+ Sketched signature (%s) available within user data...'
                    % index, 'yellow'))
            continue

        this_sig = outdir_dict[index] + '/' + index + '.sig'
        if os.path.exists(this_sig):
            ## File signature might exist

            ## read original
            file2print = outdir_dict[index] + '/.original'
            if not os.path.exists(file2print):
                original = ['NaN']
            else:
                original = HCGB_main.readList_fromFile(file2print)
                ## reuse only if sketch parameters match the current request
                if all([
                        int(options.kmer_size) == int(original[1]),
                        int(options.n_sketch) == int(original[2])
                ]):
                    siglist_all.append(
                        min_hash_caller.read_signature(this_sig,
                                                       options.kmer_size))
                    pd_samples_sketched.loc[len(pd_samples_sketched)] = (
                        'project_data', index, this_sig, row['sample'],
                        options.kmer_size, options.n_sketch)
                    print(
                        colored(
                            '\t+ Sketched signature available (%s) in project folder...'
                            % index, 'green'))
                    continue

        print(
            colored('\t+ Sketched signature to be generated: (%s)...' % index,
                    'yellow'))
        ## index assembly or reads...
        (sigfile, siglist) = generate_sketch(outdir_dict[index], row['sample'],
                                             index, options.kmer_size,
                                             options.n_sketch, Debug)
        pd_samples_sketched.loc[len(pd_samples_sketched)] = ('project_data',
                                                             index, sigfile,
                                                             row['sample'],
                                                             options.kmer_size,
                                                             options.n_sketch)
        siglist_all.append(siglist)

    print("\n+ Clustering sequences...")
    pd_samples_sketched = pd_samples_sketched.set_index('name')

    ####
    if retrieve_databases.empty:
        cluster_df = pd_samples_sketched
    else:
        tmp = retrieve_databases[[
            'source', 'db', 'path', 'original', 'ksize', 'num_sketch'
        ]]
        tmp = tmp.rename(columns={'db': 'name'})
        ## FIX: set_index returns a new dataframe; the original call
        ## discarded the result, leaving 'tmp' on its integer index so the
        ## merged cluster_df (and the color labels derived from its index)
        ## carried integers instead of sample names
        tmp = tmp.set_index('name')

        if (Debug):
            print(colored("**DEBUG: tmp **", 'yellow'))
            print(tmp)

        ## merge both dataframes
        cluster_df = pd.concat([pd_samples_sketched, tmp],
                               join='inner',
                               sort=True)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_sketched **", 'yellow'))
        print(pd_samples_sketched)

        print(colored("**DEBUG: cluster_df **", 'yellow'))
        print(cluster_df)

        print(colored("**DEBUG: Signatures **", 'yellow'))
        print(siglist_all)

        print(colored("**DEBUG: length siglist_all **", 'yellow'))
        print(len(siglist_all))

    ## Assign Colors colorLabels
    color_df = cluster_df.filter(["source"], axis=1)
    color_df["color"] = "r"  ## red::genbank

    ## project data
    project_data = list(color_df[color_df["source"] == "project_data"].index)
    color_df.loc[color_df.index.isin(project_data),
                 "color"] = "g"  ## green::project_data

    ## user_data
    user_data = list(color_df[color_df["source"] == "user_data"].index)
    color_df.loc[color_df.index.isin(user_data),
                 "color"] = "b"  ## blue::user_data

    colorLabels = color_df['color'].to_dict()

    if Debug:
        print(color_df)
        print(colorLabels)

    ## parse results
    if options.project:
        outdir_report = HCGB_files.create_subfolder("report", outdir)
        #final_dir = outdir + '/report/cluster'
        ## FIX: use HCGB_files consistently; the 'functions' alias used here
        ## originally is not referenced anywhere else in this module
        final_dir = HCGB_files.create_subfolder("cluster", outdir_report)
    else:
        final_dir = outdir

    ## compare
    name = 'cluster_' + str(HCGB_time.create_human_timestamp())
    tag_cluster_info = final_dir + '/' + name
    print('+ Saving results in folder: ', final_dir)
    print('\tFile name: ', name)
    (DataMatrix, labeltext) = min_hash_caller.compare(siglist_all,
                                                      tag_cluster_info, Debug)

    ## get colorLabels

    ## plot images
    pdf = True
    cluster_returned = min_hash_caller.plot(DataMatrix, labeltext,
                                            tag_cluster_info, pdf, colorLabels)

    ## generate newick tree
    min_hash_caller.get_Newick_tree(cluster_returned, DataMatrix, labeltext,
                                    tag_cluster_info)

    return ()
Code example #5
0
File: cluster.py  Project: HCGB-IGTP/BacterialTyper
def get_options_db(options):
    """Select which Mash databases to use according to the input options.

    Builds an option string of database sources (user data, genbank,
    project data and/or external files) and asks
    ``database_generator.getdbs()`` for the matching databases.

    :param options: argparse-like namespace with the database flags
        (database, external_file, batch_external, user_data, ...).

    :returns: pandas dataframe with the databases retrieved (empty when
        only project data is requested).
    """

    print("\n\n+ Select databases to use for identification:")

    ### absolute path of the database folder provided
    db_folder = os.path.abspath(options.database)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: Database to use: " + db_folder + " **",
                    'yellow'))

    ## external file provided: either a batch file listing several entries
    ## or a single external file
    if (options.external_file):
        ext_abs_path = os.path.abspath(options.external_file)
        if options.batch_external:
            joined_externals = ','.join(
                HCGB_main.readList_fromFile(ext_abs_path))
        else:
            joined_externals = ext_abs_path

    ############################################################
    ### Options: according to user input: select databases to use
    option_db = ""

    if (options.user_data):
        ## 1) only user data: previously identified and added
        option_db = "Mash:user_data"
    elif (options.genbank_data):
        ## 2) only genbank data: previously download from NCBI reference genomes
        option_db = "Mash:genbank"
    elif (options.only_project_data):
        ## 3) only project_data: nothing to fetch from the database folder
        option_db = "Mash:project_data"
        return (pd.DataFrame())
    elif (options.only_external_data):
        ## 4) only external_data
        option_db = "Mash_external_data:" + joined_externals
    else:
        ## all databases
        option_db = 'Mash:user_data#Mash:genbank'
        if (options.external_file):
            option_db = option_db + '#Mash_external_data:' + joined_externals

    ###############
    ### get dbs
    ###############
    print("\n+ Parsing information to retrieve databases")
    print("+ Reading from database: " + db_folder)
    HCGB_aes.print_sepLine("-", 50, False)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: option_db: " + option_db + " **", 'yellow'))

    pd_MASH = database_generator.getdbs("MASH", db_folder, option_db, Debug)
    HCGB_aes.print_sepLine("-", 50, False)

    ## return the dataframe of retrieved databases
    return (pd_MASH)
Code example #6
0
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.

    NOTE(review): this scraped copy of the function is corrupted near the
    end of the MLST branch (a '******' redaction fused two code regions and
    an 'elif' branch header is missing — see the marked line below). The
    code is kept byte-identical; the original file is needed to restore it.

    :param source: Type of database to search: ARIBA, KMA, NCBI, MLST, user_data
    :param database_folder: Absolute path to database folder.
    :param option: String containing multiple entries separated by '#' that
        indicate the type of database entries to search within each source type.
    :param debug: True/False for debugging messages.

    :type source: string
    :type database_folder: string
    :type option: string
    :type debug: bool

    :returns: Dataframe containing absolute paths to the available databases
        for each type requested. It contains columns for: "source", "db", "path"

    e.g.: source = KMA
          option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank

    e.g.: source = NCBI
          option = genbank
    """

    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        ## database folder does not exist: return the empty dataframe
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## user input: parse the '#'-separated option entries into dbs2use
    dbs2use = []
    option_list = option.split("#")

    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        ### entries collected for this option item only
        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_External', name_ext, ext
                    ]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        elif (option_item.startswith('ARIBA:')):
            ## NOTE(review): assigns dbs2use directly (not dbs2use_tmp),
            ## discarding entries collected from earlier option items —
            ## confirm against the original whether this is intended
            dbs2use = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")
                ## add to dataframe
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'Mash_external', name_ext_, ext
                    ]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(
                option_item
            )  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ## init dataframe
    #colname = ["source", "db", "path"]
    #db_Dataframe  = pd.DataFrame(columns = colname)

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names
        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))
            else:
                ## database missing: download it now and register on success
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug,
                                                    2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA databases exists
                this_db_file = this_db + '/genbank_KMA'
                ## '.comp.b' is the KMA index file checked for availability
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_genbank', 'genbank', this_db_file
                    ]

            #### user_data
            elif (db == "user_data"):
                ## KMA databases exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- user_data: including information from user previously generated results",
                            'green'))  ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_user_data', 'user_data', this_db_file
                    ]

            ## default KMA databases: bacteria & plasmids
            else:
                ## per-database KMA index prefixes — presumably matching how
                ## each default database was built; TODO confirm
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)
                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_db', db, this_db_file
                    ]
                    print(
                        colored(
                            "\t- KMA: including information from database " +
                            db, 'green'))
                else:
                    print(
                        colored("\t**KMA: Database %s was not available." % db,
                                'red'))

                    ## if missing: call download module
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    ## re-check after the download attempt
                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'KMA_db', db, this_db_file
                        ]
                        print(
                            colored(
                                "\t- KMA: including information from database "
                                + db, 'green'))
                    else:
                        print(
                            colored(
                                "\t**KMA: Database %s was not available." % db,
                                'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):

        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            ##
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(
                    os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'NCBI:genbank', entry, this_db
                    ]

        elif dbs2use[0] == 'tax_id':
            ## NOTE(review): assigned but never used/returned here —
            ## presumably unfinished (see TODO above)
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder(
                    'PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)

                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'MLST', 'PubMLST', entry + ',' + this_db
                    ]
                    print(
                        colored(
                            "\t- MLST: including information from profile: " +
                            entry, 'green'))

            else:
                db_Dataframe.loc[len(db_Dataframe)] = [
                    'MLST', 'user_profile', db
                ]
                print(
                    colored(
                        ## NOTE(review): the line below is corrupted in this
                        ## scraped source — a '******' redaction fused the end
                        ## of this print with the start of another branch
                        ## (likely `elif (source == 'MASH'):` with a
                        ## `for db in dbs2use:` loop and `if (db == "genbank"):`).
                        ## Kept byte-identical; restore from the original file.
                        "\t- MLST: including information from profile provided by user: "******"genbank"):

                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(
                        colored(
                            "\n\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')

                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])

                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(
                            this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN']
                            else:
                                original = HCGB_main.readList_fromFile(
                                    file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0], original[1],
                                original[2], this_db
                            ]
                        else:
                            ## index assembly or reads...
                            list_fna = HCGB_main.retrieve_matching_files(
                                this_db, 'genomic.fna', debug)

                            ## not available
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db
                            ]

            #### user_data
            elif (db == "user_data"):
                print(
                    colored(
                        "\n\t- user_data: including information from user previously generated results",
                        'green'))  ## include user data
                db2use_abs = HCGB_files.create_subfolder(
                    'user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    if entry == 'user_database.csv':
                        continue

                    print('\t+ Reading information from sample: ', entry)
                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        ##
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0], original[1],
                            original[2], this_db + '/mash'
                        ]
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(
                            this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0], 'NaN',
                            'NaN', this_db + '/mash'
                        ]

    #### external_data
    ### TODO: Fix this
    mash_bin = ""  #set_config.get_exe('mash')
    ## NOTE(review): `name in 'Mash_external'` tests whether each source
    ## value is a SUBSTRING of 'Mash_external' (e.g. 'Mash' would match);
    ## the intended test is presumably `name == 'Mash_external'` — confirm
    if any(name in 'Mash_external'
           for name in db_Dataframe['source'].to_list()):
        print(
            colored(
                "\t- external_data: including information from external data provided by user",
                'green'))  ## include user data
        db_Dataframe = db_Dataframe.set_index("db", drop=False)
        frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
        for index, row in frame.iterrows():
            print('\t+ Reading information for file: ', row['db'])
            outfile = row['path'] + '.msh'
            if not os.path.exists(outfile):
                path_file = os.path.dirname(row['path'])
                this_db_file = min_hash_caller.sketch_database([row['path']],
                                                               mash_bin,
                                                               row['path'],
                                                               row['db'],
                                                               path_file)
                HCGB_aes.print_sepLine("*", 50, False)

            db_Dataframe.loc[row['db']] = [
                'Mash_external', row['db'], outfile, row['path']
            ]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
Code example #7
0
def parse_options(arg_dict):
    """Parse the input option provided and build the accession-ID dataframe.

    Exactly one input source is expected (set as a mutually exclusive group
    by the caller): annotation file(s) (``annot_file``), NCBI GenBank
    assembly IDs (``GenBank_id``), NCBI taxonomy IDs (``tax_id``) or a
    previous BacDup project folder (``project``, not implemented yet).
    Data is read or downloaded accordingly and collected into ``df_accID``.

    :param arg_dict: parsed command-line arguments (argparse.Namespace).

    :returns: pandas.DataFrame indexed by 'new_name', one row per sample,
        with the columns given by ``BacDup_functions.columns_accID_table()``.
    """

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))
            ## batch file: comma-separated "name,annotation_file" entries
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(
            columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all fields for this sample; those not derivable from the
            ## annotation file stay as empty strings
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## detect annotation format (renamed from `format` to avoid
            ## shadowing the builtin)
            annot_format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + annot_format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (annot_format == 'gbk'):
                ## get information from each sample
                (taxonomy,
                 organism) = BacDup.scripts.functions.get_gbk_information(
                     file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (annot_format == 'gff'):
                ## a GFF annotation needs the companion reference genome file
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(
                            arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            ## taxonomy is a list when parsed from gbk; last element is genus
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            ## NOTE(review): plasmid_id is still the "" placeholder here, so
            ## ";".join(plasmid_id) yields "" — presumably it would be a list
            ## of IDs if ever filled in; verify against gbk parsing. TODO
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           annot_format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder(
                "db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list and create tmp folder
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            ## drop empty lines from the ID list
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## unravel each taxid into its descendant taxa
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings (renamed loop variable to avoid
        ## shadowing the builtin `int`)
        string_info_total = [str(tax) for tax in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get,
         allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
             db_folder,
             string_info_total,
             int(arg_dict.k_random),
             arg_dict.debug,
             assembly_level_given=arg_dict.assembly_level,
             group_given=group_obtained,
             section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'all_entries.txt'),
            allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nIf random numbers selected, take into account re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## create df_accID to store data
        ## TODO
        ## NOTE(review): df_accID is never created in this branch, so the
        ## set_index call below raises NameError — pending implementation.

    ## Returns dataframe with information

    df_accID = df_accID.set_index('new_name')
    return (df_accID)
コード例 #8
0
def generate_db(file_abs_paths, name, fold_name, option, type_option, Debug, kma_bin):
	"""Generate a call to create or update index KMA databases for later kmer identification. 

	:param file_abs_paths: List of absolute paths fasta genome files to include in the database.
	:param name: Database name.
	:param fold_name: Directory path to store database generated.
	:param option: Generate a new database (option = 'new') or add to pre-existing database (option = 'add'). If database exists, automatically adds.
	:param type_option: Index genome fasta files one by one (option_type='single') or using a batch file containing multiple entries (option='batch').
	:param kma_bin:	Binary executable for KMA software 
	:param Debug: True/False for debugging messages.
	
	:type file_abs_paths: list
	:type name: string
	:type fold_name: string
	:type option: string
	:type type_option: string 
	:type kma_bin:
	:type Debug: bool
		
	:returns: Absolute path to database generated, or False on indexing failure.
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
	
		- :func:`BacterialTyper.scripts.functions.readList_fromFile`
		
		- :func:`BacterialTyper.scripts.functions.printList2file`
		
		- :func:`BacterialTyper.scripts.species_identification_KMA.check_db_indexed`

		- :func:`BacterialTyper.scripts.species_identification_KMA.index_database`
		
	"""

	print ('+ Updating the KMA database: ', name)			

	## check
	if len(file_abs_paths) > 1:
		## read db in fold_name and get index files
		info = fold_name + '/' + name + '.db'

		## lineList: entries already recorded; toIndexList: new files to index;
		## indexedList: provided files already present in the database
		lineList = []
		toIndexList = []
		indexedList = []		

		### if a .db record file exists, switch to 'add' mode
		if os.path.exists(info):
			lineList = HCGB_main.readList_fromFile(info)
			option = 'add'

		for f in file_abs_paths:
			baseName = os.path.basename(f)
			
			## check if already index
			if baseName in lineList:
				print (colored('\t+ File %s is already available in database %s' %(baseName, name), 'green'))
				indexedList.append(f)
			else:
				toIndexList.append(f)		
		
		if toIndexList:
			## generate batch and call
			info2 = fold_name + '/.batch_entries.txt'
			HCGB_main.printList2file(info2, toIndexList)
			status = index_database(info2, kma_bin, name, option, fold_name, type_option)
			final_list = set(lineList + toIndexList + indexedList)
			final_list_name = [os.path.basename(f) for f in final_list]
			HCGB_main.printList2file(info, final_list_name)
			count_files = len(toIndexList)
			print ('+ %s samples have been added to the database' %count_files)
			## BUGFIX: previously fell through to `return (file_name)` below,
			## raising NameError since file_name is only set in the single-file
			## branch. Return the database path on success instead.
			if (status):
				return (fold_name + '/' + name)
			return False
		else:
			print ('\n+ No new sequences were added to the database.')
			return (fold_name + '/' + name)			
		
	else:
		## NOTE(review): assumes file_abs_paths is non-empty — an empty list
		## raises IndexError here. TODO confirm callers guarantee this.
		file_name = file_abs_paths[0]
		## check if previously indexed
		status = check_db_indexed(file_name, fold_name)
		if (status): #true
			## debug message
			if (Debug):
				print (colored("**DEBUG: Database (%s) is indexed" %file_name + " **", 'yellow'))
			return (file_name)
		else: #false
			## debug message
			if (Debug):
				print (colored("**DEBUG: Database (%s) is not indexed" %file_name + " **", 'yellow'))
			status = index_database(file_name, kma_bin, file_name, option, fold_name, type_option)
	
	## return (single-file indexing outcome)
	if (status): #true
		return (file_name)
	else:
		return False
コード例 #9
0
def check_db_indexed(index_name, folder):
	"""
	Check the status of a database
	
	:param index_name: Index name for the database
	:param folder: Absolute path of the folder containing the database.
	
	:type index_name: string
	:type folder: string
	
	:returns: True/False for the index status.
	
	.. seealso:: This function depends on other ``BacterialTyper`` functions called:
		
		- :func:`BacterialTyper.scripts.functions.readList_fromFile`
		
		- :func:`BacterialTyper.scripts.functions.get_number_lines`
		
		- :func:`BacterialTyper.scripts.functions.read_time_stamp`

		- :func:`BacterialTyper.scripts.functions.print_time_stamp`
	
	"""
	
	## Each db consist of 5 files with the following extensions:
	## b, comp.b, length.b, seq.b, name
	expected_suffixes = (".comp.b", ".index.b", ".length.b", ".name", ".seq.b")

	print ("\t+ Checking if database has been previously indexed...")
	for ext in expected_suffixes:
		candidate = index_name + ext
		if not os.path.isfile(candidate):
			## '.index.b' is optional; any other missing file -> not indexed
			if ext == '.index.b':
				continue
			return(False)
		print ("\t" + candidate + ' exists...')
	
	## check if previously assembled and succeeded
	filename_stamp = folder + '/.success'
	if os.path.isfile(filename_stamp):
		stamp =	HCGB_time.read_time_stamp(filename_stamp)
		print (colored("\tDatabase was generated on: %s" %stamp, 'yellow'))

		## Check if necessary to download again after several months/days
		days_passed = HCGB_time.get_diff_time(filename_stamp)
		print ("\t\t** %s days ago" %days_passed)		
		## force a refresh when the stamp is older than 60 days
		if (days_passed > 60): 
			print ("\t\t** Downloading information again just to be sure...")
			return(False)
	
	## dump in screen
	names_file = index_name + '.name'
	n_entries = HCGB_main.get_number_lines(names_file)
	
	print ("\n\t+ Database seems OK and contains several entries (%s):\n" %n_entries)
	if (n_entries > 50):
		print ("\tToo many entries in the database.\n\tCheck file %s for further details." %names_file)
	else:
		## print each entry on its own line
		for db_entry in HCGB_main.readList_fromFile(names_file):
			print (db_entry)

	return(True)