Beispiel #1
0
def retrieve_genes_ids_sequences(profile, gene_ID, debug):
    """Return the id and sequence of the first assembled gene matching ``gene_ID``.

    Looks for files matching ``assembled_genes.fa`` under ``profile`` (using
    :func:`HCGB_main.retrieve_matching_files`), discards any path containing
    ``ariba.tmp`` and parses the first remaining file as fasta.

    :param profile: Folder to search for ``assembled_genes.fa`` files.
    :param gene_ID: Pattern searched (via :func:`re.search`, so it is treated
        as a regular expression) within each record description.
    :param debug: True/False for debugging messages.

    :return: Tuple ``(record.id, sequence as string)`` for the first record
        whose description matches; ``('', '')`` if no record matches, no
        file is found, or the first file found does not exist.
    :rtype: tuple
    """
    ## given a profile folder
    if debug:
        HCGB_aes.debug_message('profile: ', 'yellow')
        print (profile)
        HCGB_aes.debug_message('gene_id: ', 'yellow')
        print (gene_ID)

    ## skip temporary ARIBA folders
    assembled_genes_list = [
        s for s in HCGB_main.retrieve_matching_files(profile, "assembled_genes.fa", debug)
        if 'ariba.tmp' not in s
    ]

    if debug:
        HCGB_aes.debug_message('assembled_genes_list: ', 'yellow')
        print(assembled_genes_list)

    ## nothing to parse: report "not found" with the same pair used below,
    ## instead of failing on [0] or implicitly returning None
    if not assembled_genes_list or not os.path.isfile(assembled_genes_list[0]):
        return ('', '')

    for record in SeqIO.parse(assembled_genes_list[0], "fasta"):
        if debug:
            HCGB_aes.debug_message('record.description: ', 'yellow')
            print(record.description)

        if re.search(gene_ID, record.description):
            return (record.id, str(record.seq))

    return ('', '')
def run_SPADES_assembly(path, file1, file2, sample, SPADES_bin, threads, debug=False):
	"""Generate main assembly using SPADES

	- Calls SPADES to assemble reads (using :func:`SPADES_systemCall`) with no extra options.

	- Retrieves the *scaffolds.fasta* file(s) found under ``path`` (using :func:`HCGB_main.retrieve_matching_files`).

	- Renames the sequences of the first scaffolds file using the sample name (using :func:`HCGB_fasta.rename_fasta_seqs`) into ``<path>/<sample>_assembly.fna``.

	:param path: Absolute path to folder.
	:param file1: Absolute path to fastq reads (R1).
	:param file2: Absolute path to fastq reads (R2).
	:param sample: Sample name or tag to identify sample
	:param SPADES_bin: Binary executable for SPADES assembly software.
	:param threads: Number of CPUs to use.
	:param debug: True/False for debugging messages.
	:type path: string
	:type file1: string
	:type file2: string
	:type sample: string
	:type SPADES_bin: string
	:type threads: integer
	:type debug: bool
	:return: Path to the renamed assembly fasta file.
	:rtype: string
	:warnings: Returns **FAIL** if SPADES fails, no scaffolds file is found or the renaming step fails.

	.. seealso:: This function depends on other functions called:

		- :func:`SPADES_systemCall`

		- :func:`HCGB_main.retrieve_matching_files`

		- :func:`HCGB_fasta.rename_fasta_seqs`
	"""
	##print ('+ Running main assembly...')
	options = ''
	message_return = SPADES_systemCall(path, file1, file2, sample, SPADES_bin, options, threads, debug)
	if message_return == 'FAIL':
		print ("\n\n***ERROR: SPADES failed for sample " + sample)
		return ('FAIL')

	scaffolds_retrieved = HCGB_main.retrieve_matching_files(path, "scaffolds.fasta", debug)
	## the result is indexed below, so test for emptiness rather than comparing
	## against '' (an empty list never equals '' and would crash on [0])
	if not scaffolds_retrieved:
		print ('\n\n***ERROR: No scaffolds assembly...')
		return ('FAIL')

	### Due to limitations with Genbank format, no more than 37 characters are supported for
	### locus tag identification. This might affect later annotation process and subsequent analysis
	### https://github.com/tseemann/prokka/issues/337
	new_contigs = path + '/' + sample + '_assembly.fna'
	id_conversion_file = HCGB_fasta.rename_fasta_seqs(scaffolds_retrieved[0], sample, new_contigs)

	if id_conversion_file == 'FAIL':
		print ("\n\n***ERROR: Rename contigs failed for sample " + sample)
		return ('FAIL')

	print ("+ Name conversion details saved in file " + id_conversion_file)
	return (new_contigs)
def run_SPADES_plasmid_assembly(path, file1, file2, sample, SPADES_bin, threads, debug=False):
	"""Generate plasmid assembly using SPADES

	- Calls SPADES to assemble plasmids using the ``--plasmid`` option (using :func:`SPADES_systemCall`), naming the run ``<sample>_plasmid``.

	- Retrieves the *scaffolds.fasta* file(s) found under ``<path>/<sample>_plasmid`` (using :func:`HCGB_main.retrieve_matching_files`) and returns the first one.

	:param path: Absolute path to folder.
	:param file1: Absolute path to fastq reads (R1).
	:param file2: Absolute path to fastq reads (R2).
	:param sample: Sample name or tag to identify sample
	:param SPADES_bin: Binary executable for SPADES assembly software.
	:param threads: Number of CPUs to use.
	:param debug: True/False for debugging messages.
	:type path: string
	:type file1: string
	:type file2: string
	:type sample: string
	:type SPADES_bin: string
	:type threads: integer
	:type debug: bool
	:return: Path to the plasmid scaffolds fasta file.
	:rtype: string
	:warnings: Terminates the program (``exit()``) if SPADES fails. Returns **FAIL** if no plasmid scaffolds file is found.

	.. seealso:: This function depends on other functions called:

		- :func:`SPADES_systemCall`

		- :func:`HCGB_main.retrieve_matching_files`
	"""
	print ('+ Running plasmid assembly...')
	name = sample + '_plasmid'
	options = '--plasmid '
	## forward debug, as the main assembly call does
	message_return = SPADES_systemCall(path, file1, file2, name, SPADES_bin, options, threads, debug)

	if message_return == 'FAIL':
		print ("\n\n***ERROR: plasmidSPADES failed for sample " + sample)
		exit()

	scaffolds_retrieved = HCGB_main.retrieve_matching_files(path + '/' + name, "scaffolds.fasta", debug)
	## the result is indexed below: an empty result must stop here instead of
	## raising IndexError right after printing the warning
	if not scaffolds_retrieved:
		print ('\n\n***ATTENTION: No plasmids assembly...')
		return ('FAIL')

	return (scaffolds_retrieved[0])
def getdbs(source, database_folder, option, debug):
    """Get databases available within the folder provided.

    :param source: Type of database to search. Branches exist below for:
        ARIBA, KMA, NCBI, user_data and MLST.
    :param database_folder: Absolute path to database folder.
    :param option: String containing multiple entries separated by '#' that
        indicate the type of database entries to search within each source type.
    :param debug: True/False for debugging messages.

    :type source: string
    :type database_folder: string
    :type option: string
    :type debug: bool

    :returns: Dataframe initialised with columns "source", "db", "path" and
        finally indexed by "db" (the column is kept). If ``database_folder``
        is not a directory the empty, un-indexed dataframe is returned.

    e.g.:   source = KMA
            option = kma:archaea,plasmids,bacteria#kma_external:/path/to/file1,/path/to/file2#user_data#genbank **

    e.g.:   source = NCBI
            option = genbank

    .. note:: NOTE(review): this copy of the function is corrupted. The
        statement at the end of the MLST branch is garbled (a masked
        ``******`` run fuses it with a ``"genbank"`` comparison), and the
        code that follows appends 4- and 7-value rows to the dataframe,
        which does not agree with the three columns declared here. It
        presumably belongs to a different function whose header was lost;
        restore from the upstream project before relying on it.
    """

    ## init dataframe
    colname = ["source", "db", "path"]
    db_Dataframe = pd.DataFrame(columns=colname)

    ## read folders within database
    if os.path.isdir(database_folder):
        files = os.listdir(database_folder)  ## ARIBA/KMA_db/genbank/user_data
    else:
        return db_Dataframe

    ## debug message
    if (debug):
        print(colored("Folders: " + str(files), 'yellow'))
        print()

    ## user input: each '#'-separated item adds entries to dbs2use
    dbs2use = []
    option_list = option.split("#")

    for option_item in option_list:

        ## debug message
        if (debug):
            print(colored("Option item: " + option_item, 'yellow'))

        ### entries collected for this item; merged into dbs2use at loop end
        dbs2use_tmp = []

        ## kma
        if (option_item.startswith('kma')):
            if (option_item.startswith('kma:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_external:')):
                external = option_item.split(":")[1].split(",")

                ## add to dataframe: external files are registered directly
                for ext in external:
                    name_ext = os.path.basename(ext)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_External', name_ext, ext
                    ]

            elif (option_item.startswith('kma_user_data:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

            elif (option_item.startswith('kma_NCBI:')):
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### ARIBA
        ## NOTE(review): this assignment replaces dbs2use (entries collected
        ## from earlier items are dropped) instead of extending it -- confirm
        elif (option_item.startswith('ARIBA:')):
            dbs2use = option_item.split(":")[1].split(",")

        ### NCBI: genbank
        elif (option_item.startswith('genbank')):
            dbs2use.append('genbank')

        ### NCBI: taxonomy ID
        ## NOTE(review): 'taxonomy_id' is stored here but the NCBI branch
        ## below compares against 'tax_id' -- confirm which one is intended
        elif (option_item.startswith('tax_id')):
            dbs2use.append('taxonomy_id')

        ### user_data
        elif (option_item.startswith('user_data')):
            dbs2use.append('user_data')

        ### MLST
        elif (option_item.startswith('MLST')):
            dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Mash
        elif (option_item.startswith('Mash')):
            if (option_item.startswith('Mash_external_data:')):
                external = option_item.split(":")[1].split(",")
                ## add to dataframe: db name is the file name up to '.fna'
                for ext in external:
                    name_ext = os.path.basename(ext)
                    name_ext_ = name_ext.split('.fna')[0]
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'Mash_external', name_ext_, ext
                    ]
            else:
                dbs2use_tmp = option_item.split(":")[1].split(",")

        ### Other?
        else:
            dbs2use.append(
                option_item
            )  ## add ARIBA, user_data or genbank option if provided

        ## get all
        dbs2use = dbs2use + dbs2use_tmp

    ## debug message
    if (debug):
        print(colored("\ndbs2use:\n\t" + "\n\t".join(dbs2use), 'yellow'))

    ## init dataframe
    #colname = ["source", "db", "path"]
    #db_Dataframe  = pd.DataFrame(columns = colname)

    ###############
    #### ARIBA ####
    ###############
    if (source == 'ARIBA'):
        ### Check if folder exists
        ARIBA_folder = HCGB_files.create_subfolder('ARIBA', database_folder)

        ### get information
        ARIBA_dbs = ariba_caller.get_ARIBA_dbs(dbs2use)  ## get names
        for ariba_db in ARIBA_dbs:
            this_db = os.path.join(ARIBA_folder, ariba_db + '_prepareref')
            if os.path.exists(this_db):
                code_check_db = ariba_caller.check_db_indexed(this_db, 'NO')
                if (code_check_db == True):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))
            else:
                ## database folder missing: try to download it now
                print("+ Database: ", ariba_db, " is not downloaded...")
                print("+ Download now:")
                folder_db = HCGB_files.create_subfolder(ariba_db, ARIBA_folder)
                code_db = ariba_caller.ariba_getref(ariba_db, folder_db, debug,
                                                    2)  ## get names
                if (code_db == 'OK'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'ARIBA', ariba_db, this_db
                    ]
                    print(
                        colored(
                            "\t- ARIBA: including information from database: "
                            + ariba_db, 'green'))

    #############
    #### KMA ####
    #############
    elif (source == 'KMA'):
        ### Check if folder exists
        KMA_db_abs = HCGB_files.create_subfolder('KMA_db', database_folder)
        kma_dbs = os.listdir(KMA_db_abs)

        ## debug message
        if (debug):
            print(colored("Folders KMA_db:" + str(kma_dbs), 'yellow'))

        ### get information
        for db in dbs2use:
            this_db = KMA_db_abs + '/' + db

            ## debug message
            if (debug):
                print(colored("this_db:" + this_db, 'yellow'))

            #### genbank
            if (db == "genbank"):
                ## KMA databases exists: the '.comp.b' file is used as marker
                this_db_file = this_db + '/genbank_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_genbank', 'genbank', this_db_file
                    ]

            #### user_data
            elif (db == "user_data"):
                ## KMA databases exists
                this_db_file = this_db + '/userData_KMA'
                if os.path.isfile(this_db_file + '.comp.b'):
                    print(
                        colored(
                            "\t- user_data: including information from user previously generated results",
                            'green'))  ## include user data
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_user_data', 'user_data', this_db_file
                    ]

            ## default KMA databases: bacteria & plasmids
            else:
                ## suffix appended to the database name depends on the db
                if (db == 'plasmids'):
                    prefix = '.T'
                elif (db == 'viral'):
                    prefix = '.TG'
                else:
                    prefix = '.ATG'

                this_db_file = os.path.join(this_db, db, db + prefix)
                ## debug message
                if (debug):
                    print(colored("this_db_file:" + this_db_file, 'yellow'))

                if os.path.isfile(this_db_file + '.comp.b'):
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'KMA_db', db, this_db_file
                    ]
                    print(
                        colored(
                            "\t- KMA: including information from database " +
                            db, 'green'))
                else:
                    print(
                        colored("\t**KMA: Database %s was not available." % db,
                                'red'))

                    ## if missing: call download module
                    print("+ Download missing KMA_db (%s) provided" % db)
                    species_identification_KMA.download_kma_database(
                        os.path.join(database_folder, 'KMA_db', db), db, debug)

                    ## check again after the download attempt
                    if os.path.isfile(this_db_file + '.comp.b'):
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'KMA_db', db, this_db_file
                        ]
                        print(
                            colored(
                                "\t- KMA: including information from database "
                                + db, 'green'))
                    else:
                        print(
                            colored(
                                "\t**KMA: Database %s was not available." % db,
                                'red'))

    ##############
    #### NCBI ####
    ##############
    elif (source == 'NCBI'):

        ## TODO: get additional information from
        ## info_file = dir_path + '/info.txt'

        ### Check if folder exists
        ## NOTE(review): dbs2use[0] raises IndexError if no option was collected
        path_genbank = os.path.join(database_folder, source, 'genbank')
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        ### genbank entries downloaded
        if dbs2use[0] == 'genbank':
            ## one row per entry found under <database_folder>/NCBI/genbank/bacteria
            if os.path.exists(path_genbank + '/bacteria'):
                genbank_entries = os.listdir(
                    os.path.join(path_genbank, 'bacteria'))
                for entry in genbank_entries:
                    this_db = os.path.join(path_genbank, 'bacteria', entry)
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'NCBI:genbank', entry, this_db
                    ]

        elif dbs2use[0] == 'tax_id':
            ## value is assigned but not used further in the visible code
            tax_id_entries = db2use_abs

    ###################
    #### user_data ####
    ###################
    elif (source == 'user_data'):
        ### Check if folder exists
        db2use_abs = HCGB_files.create_subfolder(dbs2use[0], database_folder)

        user_entries = os.listdir(db2use_abs)
        for entry in user_entries:
            this_db = db2use_abs + '/' + entry
            db_Dataframe.loc[len(db_Dataframe)] = ['user_data', entry, this_db]

    #################
    #### PubMLST ####
    #################
    elif (source == 'MLST'):
        ### get information
        for db in dbs2use:
            if db == 'PubMLST':
                ### Check if folder exists
                db2use_abs = HCGB_files.create_subfolder(
                    'PubMLST', database_folder)
                list_profiles = os.listdir(db2use_abs)

                ## path column stores "<entry>,<path>" for each profile
                for entry in list_profiles:
                    this_db = db2use_abs + '/' + entry
                    db_Dataframe.loc[len(db_Dataframe)] = [
                        'MLST', 'PubMLST', entry + ',' + this_db
                    ]
                    print(
                        colored(
                            "\t- MLST: including information from profile: " +
                            entry, 'green'))

            else:
                db_Dataframe.loc[len(db_Dataframe)] = [
                    'MLST', 'user_profile', db
                ]
                ## NOTE(review): the statement below is garbled in this copy:
                ## text between the MLST message and a "genbank" comparison is
                ## missing, and everything after it presumably belongs to a
                ## different function -- restore from the upstream source
                print(
                    colored(
                        "\t- MLST: including information from profile provided by user: "******"genbank"):

                ### Check if folder exists
                db2use_abs = database_folder + '/NCBI/genbank/bacteria'
                if os.path.exists(db2use_abs):
                    print(
                        colored(
                            "\n\t- genbank: including information from different reference strains available.",
                            'green'))  ## include data from NCBI
                    genbank_entries = os.listdir(db2use_abs)
                    for entry in genbank_entries:
                        print('\t+ Reading information from sample: ', entry)
                        this_db = db2use_abs + '/' + entry

                        ## get additional information from
                        info_file = this_db + '/info.txt'
                        info_data = pd.read_csv(info_file).set_index('ID')

                        info_data.fillna("NaN", inplace=True)

                        ## get readable name for each strain
                        entry_strain = str(info_data.loc[entry]['name'])

                        if entry_strain == 'NaN':  ## TODO: debug if it works
                            entry_strain = entry
                            print()
                        else:
                            print('\t\t+ Rename into: ', entry_strain)

                        list_msh = HCGB_main.retrieve_matching_files(
                            this_db, '.sig', debug)
                        if (list_msh):
                            ## print original in file
                            ## NOTE(review): the fallback list has one item but
                            ## original[1] and original[2] are read below
                            file2print = this_db + '/.original'
                            if not os.path.exists(file2print):
                                original = ['NaN']
                            else:
                                original = HCGB_main.readList_fromFile(
                                    file2print)

                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, list_msh[0],
                                this_db + '/mash/' + original[0], original[1],
                                original[2], this_db
                            ]
                        else:
                            ## index assembly or reads...
                            list_fna = HCGB_main.retrieve_matching_files(
                                this_db, 'genomic.fna', debug)

                            ## not available
                            db_Dataframe.loc[len(db_Dataframe)] = [
                                'genbank', entry_strain, 'NaN', list_fna[0],
                                'NaN', 'NaN', this_db
                            ]

            #### user_data
            elif (db == "user_data"):
                print(
                    colored(
                        "\n\t- user_data: including information from user previously generated results",
                        'green'))  ## include user data
                db2use_abs = HCGB_files.create_subfolder(
                    'user_data', database_folder)
                user_entries = os.listdir(db2use_abs)
                for entry in user_entries:
                    ## skip the csv file stored next to the sample folders
                    if entry == 'user_database.csv':
                        continue

                    print('\t+ Reading information from sample: ', entry)
                    this_db = db2use_abs + '/' + entry
                    this_mash_db = this_db + '/mash/' + entry + '.sig'
                    if os.path.exists(this_mash_db):
                        ## print original in file
                        file2print = this_db + '/mash/.original'
                        if not os.path.exists(file2print):
                            original = ['NaN', 'NaN', 'NaN']
                        else:
                            original = HCGB_main.readList_fromFile(file2print)

                        ##
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, this_mash_db,
                            this_db + '/mash/' + original[0], original[1],
                            original[2], this_db + '/mash'
                        ]
                    else:
                        ## not available
                        list_fna = HCGB_main.retrieve_matching_files(
                            this_db + '/assembly', '.fna', debug)
                        db_Dataframe.loc[len(db_Dataframe)] = [
                            'user_data', entry, 'NaN', list_fna[0], 'NaN',
                            'NaN', this_db + '/mash'
                        ]

    #### external_data
    ### TODO: Fix this
    ## NOTE(review): `name in 'Mash_external'` is a substring test against the
    ## literal, so any source value that is a substring of it also matches
    mash_bin = ""  #set_config.get_exe('mash')
    if any(name in 'Mash_external'
           for name in db_Dataframe['source'].to_list()):
        print(
            colored(
                "\t- external_data: including information from external data provided by user",
                'green'))  ## include user data
        db_Dataframe = db_Dataframe.set_index("db", drop=False)
        frame = db_Dataframe[db_Dataframe['source'] == 'Mash_external']
        for index, row in frame.iterrows():
            print('\t+ Reading information for file: ', row['db'])
            outfile = row['path'] + '.msh'
            ## sketch the file only if the '.msh' output does not exist yet
            if not os.path.exists(outfile):
                path_file = os.path.dirname(row['path'])
                this_db_file = min_hash_caller.sketch_database([row['path']],
                                                               mash_bin,
                                                               row['path'],
                                                               row['db'],
                                                               path_file)
                HCGB_aes.print_sepLine("*", 50, False)

            db_Dataframe.loc[row['db']] = [
                'Mash_external', row['db'], outfile, row['path']
            ]

    ## index by id
    db_Dataframe = db_Dataframe.set_index("db", drop=False)
    return (db_Dataframe)
Beispiel #5
0
def run_annotation(options):
    """Annotate assembled genomes with Prokka, build a report and run BUSCO.

    Steps performed:

    - Shows the requested help message and exits, if any ``help_*`` option
      is set.
    - Retrieves assembly files (mode ``assembly``, extension ``fna``) from
      ``options.input`` using :func:`sampleParser.files.get_files`.
    - Creates the output folders (project mode by default; detached mode
      writes into ``options.output_folder``).
    - Sends one :func:`annot_caller` job per retrieved row to a thread pool.
    - Unless ``options.skip_report`` is set, generates a MultiQC report in
      ``<outdir>/report/annotation`` (skipped if a ``.success`` stamp from
      a previous run is found there).
    - Calls :func:`qc.BUSCO_check` on the results.

    :param options: Parsed command-line options. Attributes read here:
        ``debug``, ``help_format``, ``help_BUSCO``, ``help_project``,
        ``help_multiqc``, ``help_Prokka``, ``input``, ``detached``,
        ``output_folder``, ``kingdom``, ``genera``, ``threads`` and
        ``skip_report``. ``batch`` and ``project`` are set by this function.

    :return: Empty tuple.
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    Debug = bool(options.debug)

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        sampleParser.help_format()
        exit()

    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()

    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()

    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        ## stop here like every other help option, instead of going on
        ## to run the whole annotation process
        exit()

    elif (options.help_Prokka):
        ## information for Prokka
        annotation.print_list_prokka()
        exit()

    ## set default
    options.batch = False

    ###
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly annotation")

    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## Project mode as default: results are stored within the input folder
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "assembly", ["fna"], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "annot",
                                            options.debug)

    ## annotate
    print("+ Annotate assemblies using prokka:")
    print("\t-Option: kingdom = ", options.kingdom, "; Annotation mode")
    if options.genera == 'Other':
        print(
            "\t-Option: genera = Off; No genus-specific BLAST databases option provided"
        )
    else:
        print("\t-Option: genera = ", options.genera,
              "; Genus-specific BLAST databases option provided")

    print("\t-Option: addgenes; Add 'gene' features for each 'CDS' feature")
    print("\t-Option: addmrna;  Add 'mRNA' features for each 'CDS' feature")
    print("\t-Option: cdsrnaolap;  Allow [tr]RNA to overlap CDS")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    ## ThreadPoolExecutor rejects max_workers <= 0: keep at least one worker
    ## when threads per job exceed the threads available
    max_workers_int = max(1, int(options.threads / threads_job))

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(annot_caller, row['sample'],
                            outdir_dict[row['name']], options, row['name'],
                            threads_job): index
            for index, row in pd_samples_retrieved.iterrows()
        }
        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                ## result is only requested to surface exceptions of the job
                cmd2.result()
            except Exception as exc:
                ## report the failing job and carry on with the others
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## time stamp
    HCGB_time.timestamp(start_time_total)

    ## get folders
    givenList = list(outdir_dict.values())
    protein_files = []
    print(
        "+ Detail information for each sample could be identified in separate folders:"
    )
    for folder in givenList:
        print('\t + ', folder)
        protein_files.extend(
            HCGB_main.retrieve_matching_files(folder, '.faa', Debug))

    ### report generation
    if (options.skip_report):
        print("+ No annotation report generation...")
    else:
        ### report generation
        HCGB_aes.boxymcboxface("Annotation report")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        PROKKA_report = HCGB_files.create_subfolder("annotation",
                                                    outdir_report)
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % PROKKA_report)

        ## check if previously report generated: a stamp file marks success
        filename_stamp = PROKKA_report + '/.success'
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print(
                colored(
                    "\tA previous report generated results on: %s" % stamp,
                    'yellow'))
        else:
            ## get subdirs generated and call multiQC report module
            multiQC_report.multiQC_module_call(givenList, "Prokka",
                                               PROKKA_report, "-dd 2")
            print(
                '\n+ A summary HTML report of each sample is generated in folder: %s'
                % PROKKA_report)

            ## success stamps
            HCGB_time.print_time_stamp(filename_stamp)

    ## time stamp
    start_time_partial_BUSCO = HCGB_time.timestamp(start_time_total)

    ## Check each annotation using BUSCO
    qc.BUSCO_check(input_dir, outdir, options,
                   start_time_partial_BUSCO, "proteins")

    ## print to file: results

    print("\n*************** Finish *******************")
    HCGB_time.timestamp(start_time_total)

    print("+ Exiting Annotation module.")
    return ()
Beispiel #6
0
def run_biotype(options):
    """Run the RNA biotype analysis module.

    Workflow, as implemented below:

    - Show a help message and exit if any help flag is set.
    - Retrieve the sample files found in ``options.input`` (raw fastq
      files when ``options.noTrim`` is set, trimmed reads otherwise).
    - Map the reads of each sample (``mapReads_module``).
    - Get the RNA biotype information from the mapping results
      (``RNAbiotype.RNAbiotype_module_call``).
    - Unless ``options.skip_report`` is set: call the MultiQC report
      module, write a summary CSV and call an R script that plots it.

    :param options: Parsed command-line options. Attributes read here:
        ``help_format``, ``help_project``, ``help_RNAbiotype``, ``debug``,
        ``single_end``, ``input``, ``detached``, ``output_folder``,
        ``noTrim``, ``threads``, ``annotation`` and ``skip_report``.
        ``options.pair`` and ``options.project`` are set by this function.
    :returns: An empty tuple.
    """

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_XICRA.help_fastq_format()
        ## stop after showing the help, as the other help options do
        exit()
    elif (options.help_project):
        ## information for project
        help_XICRA.project_help()
        exit()
    elif (options.help_RNAbiotype):
        ## information for RNA biotype
        RNAbiotype.help_info()
        exit()

    ## debugging messages: module-level flag
    global Debug
    Debug = bool(options.debug)

    ### set as default paired_end mode
    options.pair = not options.single_end

    aesthetics_functions.pipeline_header('XICRA')
    aesthetics_functions.boxymcboxface("RNA biotype analysis")
    print("--------- Starting Process ---------")
    time_functions.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)

    ## set mode: project/detached
    if (options.detached):
        ## detached: results go to a separate output folder
        outdir = os.path.abspath(options.output_folder)
        options.project = False
    else:
        ## project: results are stored within the input folder
        options.project = True
        outdir = input_dir

    ## get files
    print('+ Getting files from input folder... ')
    if options.noTrim:
        print('+ Mode: fastq.\n+ Extension: ')
        print("[ fastq, fq, fastq.gz, fq.gz ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
            options.debug)
    else:
        print('+ Mode: trim.\n+ Extension: ')
        print("[ _trim_ ]\n")
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)

        ## Discard if joined reads: use trimmed single-end or paired-end
        pd_samples_retrieved = pd_samples_retrieved[
            pd_samples_retrieved['ext'] != '_joined']

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        files_functions.create_folder(outdir)

    ## output folder for the mapping of each sample
    mapping_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "map", options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_outdir_dict **", 'yellow'))
        print(mapping_outdir_dict)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_total)

    ## optimize threads: one entry per unique sample name
    name_list = set(pd_samples_retrieved["new_name"].tolist())
    threads_job = main_functions.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    ##############################################
    ## map Reads
    ##############################################
    ## NOTE(review): mapReads_module is expected to return the pair
    ## (time stamp, mapping results); confirm against its definition.
    ## Previously the whole return value was stored in start_time_partial
    ## and mapping_results was left undefined (NameError further down).
    (start_time_partial,
     mapping_results) = mapReads_module(options, pd_samples_retrieved,
                                        mapping_outdir_dict, options.debug,
                                        max_workers_int, threads_job,
                                        start_time_partial, outdir)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: mapping_results **", 'yellow'))
        print(mapping_results)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    ## output folder for the biotype results of each sample
    biotype_outdir_dict = files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved, "biotype",
        options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: biotype_outdir_dict **", 'yellow'))
        print(biotype_outdir_dict)

    ## get RNAbiotype information
    RNAbiotype.RNAbiotype_module_call(mapping_results, biotype_outdir_dict,
                                      options.annotation, options.debug,
                                      max_workers_int, threads_job)

    # time stamp
    start_time_partial = time_functions.timestamp(start_time_partial)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print(
            "\n+ Generating a report using MultiQC module for featureCount analysis."
        )
        outdir_report = files_functions.create_subfolder("report", outdir)

        print(
            "+ Detail information for each sample could be identified in separate folders:"
        )

        ## unique biotype folders to pass to the multiQC report module
        my_outdir_list = set(biotype_outdir_dict.values())

        ## debug message
        if (Debug):
            print(
                colored("\n**DEBUG: my_outdir_list for multiqc report **",
                        'yellow'))
            print(my_outdir_list)
            print("\n")

        ## call multiQC report module
        featureCount_report = files_functions.create_subfolder(
            "featureCount", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "featureCount",
                                           featureCount_report, "-dd 2")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % featureCount_report)

        ### Summarizing RNA biotype information
        biotype_report = files_functions.create_subfolder(
            "biotype", outdir_report)
        single_files_biotype = files_functions.create_subfolder(
            "samples", biotype_report)

        ## results: sample -> featureCount table (only non-empty files)
        dict_files = {}

        for samples in biotype_outdir_dict:
            featurecount_file = os.path.join(biotype_outdir_dict[samples],
                                             'featureCount.out.tsv')
            if files_functions.is_non_zero_file(featurecount_file):
                dict_files[samples] = featurecount_file

            ## copy pdf: skip the sample if no pdf file was found instead
            ## of failing with an IndexError on an empty list
            pdf_plot = main_functions.retrieve_matching_files(
                biotype_outdir_dict[samples], '.pdf', options.debug)
            if pdf_plot and files_functions.is_non_zero_file(pdf_plot[0]):
                shutil.copy(pdf_plot[0], single_files_biotype)

        ## collapse all information
        all_data = RNAbiotype.generate_matrix(dict_files)

        ## print into excel/csv
        print('+ Table contains: ', len(all_data), ' entries\n')

        ## debugging messages
        if Debug:
            print("** DEBUG: all_data")
            print(all_data)

        ## summary table is stored in the report folder
        abs_csv_outfile = os.path.join(biotype_report, "summary.csv")
        all_data.to_csv(abs_csv_outfile)

        ## create plot: call R [TODO: implement in python]
        outfile_pdf = os.path.join(biotype_report, "RNAbiotypes_summary.pdf")

        ## R scripts
        biotype_R_script = tools.R_scripts('plot_RNAbiotype_sum',
                                           options.debug)
        rscript = set_config.get_exe("Rscript", options.debug)
        cmd_R_plot = "%s %s -f %s -o %s" % (rscript, biotype_R_script,
                                            abs_csv_outfile, outfile_pdf)

        ##
        print("+ Create summary plot for all samples")
        system_call_functions.system_call(cmd_R_plot)

    print("\n*************** Finish *******************")
    ## total time stamp since the module started
    time_functions.timestamp(start_time_total)
    print("\n+ Exiting RNA biotype module.")
    return ()