Example #1
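The snippets below come from the BacterialTyper / BacDup pipelines and share a common set of imports. A minimal sketch of the modules they rely on is given here; the exact paths of the HCGB helper modules are assumptions inferred from the aliases used in the code, so they are left commented out.

## Assumed imports for the examples below.
import os
import re
import time
import shutil
import concurrent.futures

import pandas as pd
from termcolor import colored

## HCGB helper aliases (module paths are assumptions, hence commented out):
# import HCGB.functions.aesthetics_functions as HCGB_aes
# import HCGB.functions.files_functions as HCGB_files
# import HCGB.functions.main_functions as HCGB_main
# import HCGB.functions.system_call_functions as HCGB_sys
# import HCGB.functions.time_functions as HCGB_time
# from HCGB import sampleParser
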
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters"""
    
    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()
    
    print ("+ Checking agr genes for each sample retrieved...")
    
    agrvate_results = pd.DataFrame()
    
    ## Not parallelized: agrvate writes into its current working directory,
    ## so we need to chdir into each sample folder before calling it.
    for name, assembly_file in dict_assemblies.items():
        sample_folder = HCGB_files.create_folder(dict_folders[name])
        ## check if previously done and succeeded
        filename_stamp = sample_folder + '/.success'
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        else:
            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')
            
            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)
    
    print ("+ Jobs finished%s\n+ Collecting information for all samples...")
    os.chdir(path_here)
    
    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)
    
    return(agrvate_results)
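
A minimal usage sketch for this function; the sample name and paths below are hypothetical.

## Hypothetical inputs: sample name -> assembly FASTA / output folder.
dict_assemblies = {'sample1': '/data/assemblies/sample1.fna'}
dict_folders = {'sample1': '/analysis/agrvate/sample1'}

agrvate_results = agrvate_caller(dict_assemblies, dict_folders, debug=True)
print(agrvate_results)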
Example #2
def parse_search_options(arg_dict):
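    """Parse input options for the duplicate search module and return
    a dataframe of the samples retrieved."""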

    ##
    outdir = os.path.abspath(arg_dict.input_folder)

    ## --------------------------------------- ##
    ## Project containing data
    ## --------------------------------------- ##
    if (arg_dict.project):
        print(colored('\t* BacDup project folder:.......[OK]', 'green'))

        ## set missing options
        arg_dict.pair = False
        arg_dict.include_all = True
        arg_dict.include_lane = True

        ## find samples previously parsed and prepared within a BacDup project structure
        pd_proteins = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                   ["fa"], arg_dict.debug)
        pd_proteins = pd_proteins.drop(["dirname", "name", "ext", "tag"],
                                       axis=1)
        pd_proteins = pd_proteins.rename(index=str,
                                         columns={'sample': 'file_data'})
        pd_proteins['format'] = 'fasta'

        pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                ["annot_df.csv"],
                                                arg_dict.debug)
        pd_annot = pd_annot.drop(["dirname", "name", "ext", "tag"], axis=1)
        pd_annot = pd_annot.rename(index=str,
                                   columns={'sample': 'annot_table'})

        ## merge into pd_samples_retrieved
        pd_samples_retrieved = pd.merge(pd_proteins, pd_annot)

        ## debug messages
        if (arg_dict.debug):
            debug_message('pd_proteins:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_proteins)

            debug_message('pd_annot:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_annot)

            debug_message('pd_samples_retrieved:', 'yellow')
            HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## --------------------------------------- ##
    ## data on multiple sources
    ## --------------------------------------- ##
    elif (arg_dict.detached):
        print(colored('\t* Detached mode:.......[OK]', 'green'))

        ## parse samples provided
        print()

        #########################################################
        ## BLAST raw results provided: either batch or single
        #########################################################
        if (arg_dict.text_file):
            print(
                colored('\t* BLAST raw results provided:.......[OK]', 'green'))
            print()

            # *************************** ##
            ## Batch file provided
            # *************************** ##
            if (arg_dict.batch):
                ## debug messages
                if (arg_dict.debug):
                    debug_message('+++++++++++++++++++++++++++++++')
                    debug_message(
                        'Multiple BLAST results file provided option:',
                        'yellow')
                    debug_message('arg_dict.text_file: ' + arg_dict.text_file,
                                  'yellow')

                ## check if ok
                BacDup_functions.file_readable_check(arg_dict.text_file)

                print(
                    colored(
                        '\t* Multiple BLAST results files provided .......[OK]',
                        'green'))
                dict_entries = HCGB_main.file2dictionary(
                    arg_dict.text_file, ',')

                ## check file is readable
                BacDup_functions.file_readable_check(arg_dict.annot_table)
                dict_entries_annot = HCGB_main.file2dictionary(
                    arg_dict.annot_table, ',')

                ## Check both dictionaries contain the same samples
                if (dict_entries.keys() == dict_entries_annot.keys()):
                    for sample, files in dict_entries.items():
                        ## check annot_table and fasta_file headers are the same ##
                        return_code = dup_searcher.check_annot_table(
                            dict_entries_annot[sample], files, 'BLAST',
                            arg_dict.debug)
                        if not (return_code):
                            print(
                                'Process will continue but sample %s will be discarded'
                                % sample)
                        else:
                            print()
                            ## TODO: fill dataframe pd_samples_retrieved

            # *************************** ##
            ## single file provided
            # *************************** ##
            else:
                ## check annot_table and fasta_file headers are the same ##
                return_code = dup_searcher.check_annot_table(
                    arg_dict.annot_table, arg_dict.text_file, 'BLAST',
                    arg_dict.debug)
                if not (return_code):
                    print('Process will stop here. Please check input files.')
                    exit()
                else:
                    print()
                    ## TODO: fill dataframe pd_samples_retrieved

        #########################################################
        ## annotations file provided: either batch or single
        #########################################################
        elif (arg_dict.annot_file):
            ## debug messages
            if (arg_dict.debug):
                debug_message('Annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## get input info
            df_accID = input_parser.parse_options(arg_dict)
            if (arg_dict.debug):
                debug_message('df_accID', 'yellow')
                print(df_accID)

            ## parse info
            input_parser.parse_information(arg_dict, df_accID, outdir)

            ## set missing options
            arg_dict.pair = False
            arg_dict.include_all = True
            arg_dict.include_lane = True

            ## find samples previously parsed and prepared within a BacDup project structure
            pd_proteins = sampleParser.files.get_files(arg_dict, outdir,
                                                       "parse", ["fa"],
                                                       arg_dict.debug)
            pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse",
                                                    ["annot_df.csv"],
                                                    arg_dict.debug)

            ## merge into pd_samples_retrieved
            frames = [pd_proteins, pd_annot]
            pd_samples_retrieved = pd.concat(frames, sort=True, join='outer')

            if (arg_dict.debug):
                debug_message('pd_samples_retrieved', 'yellow')
                print(pd_samples_retrieved)

        #########################################################
        ## CDS fasta and annotations provided: either batch or single
        #########################################################
        elif arg_dict.fasta_prot:

            # *************************** ##
            ## Batch file provided
            # *************************** ##
            if (arg_dict.batch):
                print(
                    colored('\t* Multiple FASTA files provided .......[OK]',
                            'green'))

                ## debug messages
                if (arg_dict.debug):
                    debug_message('+++++++++++++++++++++++++++++++')
                    debug_message(
                        'Multiple Protein FASTA files provided option:',
                        'yellow')
                    debug_message(
                        'arg_dict.fasta_prot: ' + arg_dict.fasta_prot,
                        'yellow')

                ## check if ok
                BacDup_functions.file_readable_check(arg_dict.fasta_prot)
                dict_entries = HCGB_main.file2dictionary(
                    arg_dict.fasta_prot, ',')

                ## check file is readable
                BacDup_functions.file_readable_check(arg_dict.annot_table)
                print(
                    colored(
                        '\t* Multiple annotation tables provided .......[OK]',
                        'green'))
                dict_entries_annot = HCGB_main.file2dictionary(
                    arg_dict.annot_table, ',')

                ## Check both dictionaries contain the same samples
                if (dict_entries.keys() == dict_entries_annot.keys()):
                    for sample, files in dict_entries.items():
                        ## check annot_table and fasta_file headers are the same ##
                        return_code = dup_searcher.check_annot_table(
                            dict_entries_annot[sample], files, 'fasta',
                            arg_dict.debug)
                        if not (return_code):
                            print(
                                'Process will continue but sample %s will be discarded'
                                % sample)
                        else:
                            print()
                            ## TODO: fill dataframe pd_samples_retrieved

            # *************************** ##
            ## single file provided
            # *************************** ##
            else:
                print(
                    colored('\t* Protein FASTA file provided .......[OK]',
                            'green'))
                BacDup_functions.file_readable_check(arg_dict.fasta_prot)

                ## check file is readable
                print(
                    colored('\t* An annotation table provided .......[OK]',
                            'green'))
                BacDup_functions.file_readable_check(arg_dict.annot_table)

                ## check annot_table and fasta_file headers are the same ##
                return_code = dup_searcher.check_annot_table(
                    arg_dict.annot_table, arg_dict.fasta_prot, 'fasta',
                    arg_dict.debug)
                if not (return_code):
                    print('Process will stop here. Please check input files.')
                    exit()
                else:
                    print()
                    ## TODO: fill dataframe pd_samples_retrieved
                    exit()

        ## No valid input option provided
        else:
            print()

    ## return information
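    ## NOTE: pd_samples_retrieved is only assigned in some of the branches
    ## above; unfinished branches would raise a NameError here.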
    pd_samples_retrieved = pd_samples_retrieved.set_index('new_name')
    return (pd_samples_retrieved)
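
parse_search_options expects an argparse-style namespace. A minimal sketch of the attributes it reads follows; the attribute names are taken from the function body, while the values are hypothetical.

import argparse

## Attributes referenced by parse_search_options; values are hypothetical.
arg_dict = argparse.Namespace(
    input_folder='my_BacDup_project',
    project=True,       ## BacDup project folder mode
    detached=False,     ## set True for detached mode
    text_file=None,     ## raw BLAST results (detached mode)
    annot_file=None,    ## annotation file (detached mode)
    annot_table=None,   ## annotation table(s)
    fasta_prot=None,    ## protein FASTA input (detached mode)
    batch=False,        ## batch file listing multiple entries
    debug=False)

pd_samples_retrieved = parse_search_options(arg_dict)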
Example #3
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""
    
    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')
    
    ## system call
    cmd_call = "%s -i %s -m -f >  %s 2> %s " %(agrvate_bin, 
                                               assembly_file,
                                               log_call, err_call) ## use mummer (-m) and force results folder (-f)
    status = HCGB_sys.system_call(cmd_call)
    
    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()
    
    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]    
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')
    
    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")
        
        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'), os.path.join(results_folder, 'error_report.tab'))
        
        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') ## open excel handle
    
        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)
    
        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample
        
        ## columns
        # agr_group: gp1/gp2/gp3/gp4; 'u' means unknown.
        ##           If multiple agr groups were found (col 5 = m),
        ##           the displayed agr group is the majority/highest confidence.
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown.
        ##              Multiple groups are likely due to multiple S. aureus isolates in the sequence.
        # frameshifts: number found in the CDS of the extracted agr operon ('u' if the operon was not extracted)
        
        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel
        ## tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary') ## write excel handle

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample
            
            ## columns
            ## Assembly Contig ID
            ## ID of matched agr group kmer
            ## evalue
            ## Percentage identity of match
            ## Start position of kmer alignment on input sequence
            ## End position of kmer alignment on input sequence
    
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)
            
            ## save agr_gp_tab file into excel
            ## tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon') ## write excel handle

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)
            
            results['operon_fna'] = agr_operon_fna_file
        except IndexError:
            ## no agr_operon.fna file was produced
            results['operon_fna'] = ''

        ## error report tab
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report =  HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)
            
        ## save error_report file into excel
        ## tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps') ## write excel handle
        
        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.close() ## close excel handle (ExcelWriter.save() was removed in pandas 2.0)
    
        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)
        
    return (results)
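
The per-sample Excel workbook written above can be inspected with pandas. A small sketch, assuming a workbook produced for a hypothetical sample:

import pandas as pd

## Read every sheet of the agrvate results workbook into a dict of DataFrames.
sheets = pd.read_excel('sample1_agr_results.xlsx', sheet_name=None)
for tab_name, df in sheets.items():  ## tabs written above: summary, operon, steps
    print(tab_name, df.shape)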
Example #4
def run(options):

    ## init time
    start_time_total = time.time()

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_trimm_adapters):
        ## help on trimm adapters
        trimmomatic_call.print_help_adapters()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Trimming samples")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    if (options.detached):
        options.project = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(
        options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"),
        options.debug)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("pd_samples_retrieved", 'yellow')
        HCGB_main.print_all_pandaDF(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)
    ## for samples
    outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                            pd_samples_retrieved, "trimm",
                                            options.debug)

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(
        options.threads, len(name_list))  ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        print(
            colored("**DEBUG: options.threads " + str(options.threads) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: max_workers " + str(max_workers_int) + " **",
                    'yellow'))
        print(
            colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow'))

    print("+ Trimming adapters for each sample retrieved...")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # Trimming adapters
    if (options.adapters):
        # Adapter file provided
        options.adapters = os.path.abspath(options.adapters)
        print("\t- Adapters file provided...")
    else:
        # Get default adapters file
        print("\t- Default Trimmomatic adapters (v0.39) will be used...")
        options.adapters = data_files.data_list(
            "available_Trimmomatic_adapters")

    ## send for each sample
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers_int) as executor:
        commandsSent = {
            executor.submit(trimmo_caller, sorted(cluster["sample"].tolist()),
                            outdir_dict[name], name, threads_job, Debug,
                            options.adapters): name
            for name, cluster in sample_frame
        }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("\n\n+ Trimming samples has finished...")
    ## functions.timestamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## get files generated and generate symbolic link
    if not options.project:
        dir_symlinks = HCGB_files.create_subfolder('link_files', outdir)
        files2symbolic = []
        folders = os.listdir(outdir)

        ## debug message
        if (Debug):
            print(
                colored(
                    "**DEBUG: generate symbolic links for each file in " +
                    dir_symlinks + "**", 'yellow'))

        for fold in folders:
            if fold.endswith(".log"):
                continue
            else:
                this_folder = outdir + '/' + fold
                subfiles = os.listdir(this_folder)
                for files in subfiles:
                    files_search = re.search(
                        r".*trim_R\d{1}.*",
                        files)  ## only paired-end. Todo: single end
                    if files_search:
                        files2symbolic.append(this_folder + '/' + files)

        HCGB_files.get_symbolic_link(files2symbolic, dir_symlinks)

    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## call multiQC report module
        givenList = [v for v in outdir_dict.values()]
        my_outdir_list = set(givenList)

        ## debug message
        if (Debug):
            HCGB_aes.debug_message("my_outdir_list for multiqc report",
                                   "yellow")
            print(my_outdir_list)
            print("\n")

        trimm_report = HCGB_files.create_subfolder("trimm", outdir_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "Trimmomatic",
                                           trimm_report, "")
        print(
            '\n+ A summary HTML report of each sample is generated in folder: %s'
            % trimm_report)

        ## create fastqc for trimmed reads
        pd_samples_retrieved_trimmed = sampleParser.files.get_files(
            options, input_dir, "trim", ['_trim'], options.debug)
        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options,
                  start_time_partial, "trimmed", Debug)

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)
    print("\n+ Exiting trimm module.")
    return ()
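
The trimming jobs above are dispatched with the standard library submit/as_completed pattern. A self-contained sketch of that pattern, with a dummy worker standing in for trimmo_caller:

import concurrent.futures

def dummy_worker(name, threads):
    ## stand-in for trimmo_caller
    return "%s finished with %i threads" % (name, threads)

samples = ['sample1', 'sample2', 'sample3']
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    commandsSent = {executor.submit(dummy_worker, name, 2): name for name in samples}
    for future in concurrent.futures.as_completed(commandsSent):
        name = commandsSent[future]
        try:
            print(future.result())
        except Exception as exc:
            print('%r generated an exception: %s' % (name, exc))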
Example #5
def BUSCO_check(input_dir, outdir, options, start_time_total, mode):

    HCGB_aes.boxymcboxface("BUSCO Analysis Quality check")

    ## absolute path for in & out
    database_folder = os.path.abspath(options.database)

    ## get files and get dir for each sample according to mode
    if mode == 'genome':
        pd_samples_retrieved = sampleParser.files.get_files(
            options, input_dir, "assembly", ["fna"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("assembly_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "assemble_qc",
                                                      options.debug)

    elif mode == 'proteins':
        pd_samples_retrieved = sampleParser.files.get_files(
            options, outdir, "annot", ["faa"], options.debug)

        if not options.project:
            outdir = HCGB_files.create_subfolder("annot_qc", outdir)

        if options.debug:
            print("** DEBUG: pd_samples_retrieved")
            print(pd_samples_retrieved)

        BUSCO_outdir_dict = HCGB_files.outdir_project(outdir, options.project,
                                                      pd_samples_retrieved,
                                                      "annot_qc",
                                                      options.debug)

    ## add column to dataframe
    pd_samples_retrieved['busco_folder'] = ""
    for index, row in pd_samples_retrieved.iterrows():
        pd_samples_retrieved.at[index, 'busco_folder'] = BUSCO_outdir_dict[
            row['name']]

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("df_samples_busco", 'yellow')
        print(pd_samples_retrieved)

        HCGB_aes.debug_message("BUSCO_outdir_dict", 'yellow')
        print(BUSCO_outdir_dict)

    ## Check each using BUSCO
    database_folder = os.path.abspath(options.database)
    BUSCO_Database = HCGB_files.create_subfolder('BUSCO', database_folder)
    if not os.path.exists(BUSCO_Database):
        HCGB_files.create_folder(BUSCO_Database)

    ## call
    (dataFrame_results, stats_results) = BUSCO_caller.BUSCO_call(
        options.BUSCO_dbs, pd_samples_retrieved, BUSCO_Database,
        options.threads, mode)

    ## debug message
    if (options.debug):
        HCGB_aes.debug_message("dataFrame_results", 'yellow')
        HCGB_main.print_all_pandaDF(dataFrame_results)

    ## functions.timestamp
    print("+ Quality control of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    ## multiqc report plot
    if (options.skip_report):
        print("+ No report generation...")
    else:
        print("\n+ Generating a report BUSCO plot.")
        outdir_report = HCGB_files.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        givenList = []
        print(
            "+ Detailed information for each sample can be found in separate folders."
        )

        ## name folder according to mode
        if mode == 'genome':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_assembly",
                                                       outdir_report)
        elif mode == 'proteins':
            BUSCO_report = HCGB_files.create_subfolder("BUSCO_annot",
                                                       outdir_report)

        ## generate plots
        print("+ Generate summarizing plots...")
        BUSCO_caller.BUSCO_plots(dataFrame_results, BUSCO_report,
                                 options.threads)
        print('\n+ Check quality plots in folder: %s' % BUSCO_report)

        ## TODO
        ## Parse BUSCO statistics in dataframe (stats_results) to discard samples if necessary
        ## given a cutoff, discard or advise to discard some samples

        ### print statistics
        stats_results.to_csv(BUSCO_report + "/BUSCO_stats.csv")
        name_excel = BUSCO_report + "/BUSCO_stats.xlsx"
        writer = pd.ExcelWriter(name_excel, engine='xlsxwriter')
        stats_results.to_excel(writer, sheet_name="BUSCO statistics")
        writer.close()  ## ExcelWriter.save() was removed in pandas 2.0

        print('\n+ Check quality statistics in folder: %s' % BUSCO_report)

    return (dataFrame_results)
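
Since ExcelWriter.save() was removed in pandas 2.0, the statistics export above uses close(); a context manager is the more idiomatic form. A sketch with dummy data:

import pandas as pd

stats_results = pd.DataFrame({'sample': ['s1'], 'complete_BUSCOs': [95.2]})

## The context manager closes the Excel handle on exit.
with pd.ExcelWriter('BUSCO_stats.xlsx', engine='xlsxwriter') as writer:
    stats_results.to_excel(writer, sheet_name="BUSCO statistics")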
Example #6
def run_prep(options):
	"""
	Main function of the prep module.
	
	This module prepares fastq files for later usage. It initially checks the length
	of the sample names and advises the user to rename them if the limit is exceeded. Throughout
	``BacterialTyper`` there are a few string length limitations imposed by different software
	that need to be sorted out from the beginning of the process.
	
	This module allows the user to copy files into the newly initiated project folder, or only
	create symbolic links to avoid duplicating raw data.
	
	See additional details of this module in user_guide :ref:`prep module entry<prep-description>`. 

	
	.. seealso:: This function depends on other HCGB functions called:
	
		- :func:`HCGB.sampleParser`
		
		- :func:`HCGB.functions.aesthetics_functions`
		
		- :func:`HCGB.functions.time_functions`
	
		- :func:`HCGB.functions.main_functions`
		
		- :func:`HCGB.functions.file_functions`
		
	"""
	
	## help_format option
	if (options.help_format):
		help_info.help_fastq_format()
		exit()
		
	HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
	HCGB_aes.boxymcboxface("Preparing samples")
	print ("--------- Starting Process ---------")
	HCGB_time.print_time()
	
	## init time
	start_time_total = time.time()
	
	## absolute path for in & out
	input_dir = os.path.abspath(options.input)
	outdir = os.path.abspath(options.output_folder)

	### set as default paired_end mode
	if (options.single_end):
		options.pair = False
	else:
		options.pair = True

	## Project mode as default
	project_mode = True
	if (options.detached):
		options.project = False
		project_mode = False
	else:
		options.project = True

	## output folder	
	print ("\n+ Create output folder(s):")
	HCGB_files.create_folder(outdir)

	### info
	final_dir = ""
	if (options.project):
		print ("+ Generate a directory containing information within the project folder provided")
		final_dir = HCGB_files.create_subfolder("info", outdir)
	else:
		final_dir = outdir
	
	## get files
	pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug)
		
	## Information returned in pd_samples_retrieved
	### sample, dirname, name, name_len, lane, read_pair, lane_file, ext, gz
	
	if options.debug:
		HCGB_aes.debug_message("pd_samples_retrieved", "yellow")
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
	
	## time stamp
	start_time_partial = HCGB_time.timestamp(start_time_total)
	
	## check character limitation
	list_lengths = pd_samples_retrieved.loc[:,'name_len'].to_list()
	if any(i > 10 for i in list_lengths):
		print (colored("\t ** Name lengths exceeds the 10 character limitation...", 'yellow'))
		if not (options.rename):
			print (colored("** ERROR: Rename files or provide --rename option...", 'red'))
			exit()

	### rename files 
	if (options.rename):
		options.rename = os.path.abspath(options.rename)
		if not HCGB_files.is_non_zero_file(options.rename):
			print (colored("** ERROR: File provided with rename information is not readable.", 'red'))
			print (options.rename)
			exit()
		
		names_retrieved = pd.read_csv(options.rename, sep=',',
									index_col=0, header=None
									).squeeze("columns").to_dict() ## read csv to dictionary; the squeeze= kwarg was removed in pandas 2.0
		if (options.debug):
			HCGB_aes.debug_message("names_retrieved", "yellow")
			print (names_retrieved)
			
		## TODO: check integrity of new names and special characters
	
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		rename_details = final_dir + '/' + timestamp + '_prep_renameDetails.txt'
		rename_details_hd = open(rename_details, 'w')
	
		## rename files 		
		for index, row in pd_samples_retrieved.iterrows():
			if (row['gz']):
				extension_string = row['ext'] + row['gz']
			else:
				extension_string = row['ext']
			
			if options.single_end:
				renamed = names_retrieved[row['name']] + '.' + extension_string
			else:
				renamed = names_retrieved[row['name']] + '_' + row['read_pair'] + '.' + extension_string
			
			## modify frame
			pd_samples_retrieved.loc[index, 'new_file'] = renamed
			pd_samples_retrieved.loc[index, 'new_name'] = names_retrieved[row['name']]
			## save in file
			string = row['sample'] + '\t' + renamed + '\n'
			rename_details_hd.write(string)
			
			if (options.debug):
				print (colored('** DEBUG: rename', 'yellow'))
				print ("Original: ", row['name'])
				print ("Renamed: ", names_retrieved[row['name']])
				print ("File:", renamed)
		
		rename_details_hd.close()	

		## works for both single-end and paired-end files
		print ("+ Sample files have been renamed...")
	else:
		pd_samples_retrieved['new_file'] = pd_samples_retrieved['file']

	## create outdir for each sample
	outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved, "raw", options.debug)	
		
	## merge option
	if (options.merge):
		print ("+ Sample files will be merged...")
		## TODO: check when rename option provided
		pd_samples_merged = sampleParser.merge.one_file_per_sample(
			pd_samples_retrieved, outdir_dict, options.threads,	
			final_dir, options.debug)
		
		if (options.rename):
			print ("+ Merge files have been renamed...")
		else:
			print ("+ Sample files have been merged...")
		
		## process is finished here
		print ("\n*************** Finish *******************")
		start_time_partial = HCGB_time.timestamp(start_time_total)
	
		print ("+ Exiting prep module.")
		exit()
	
	## debugging messages
	if (options.debug):
		print (colored("** DEBUG: pd_samples_retrieved", 'yellow'))
		HCGB_main.print_all_pandaDF(pd_samples_retrieved)
		print (colored("** DEBUG: outdir_dict", 'yellow'))
		print (outdir_dict)
	
	## copy or create symbolic link for files
	if (options.copy):
		print ("+ Sample files will be copied...")
		## print to a file
		timestamp = HCGB_time.create_human_timestamp()
		copy_details = final_dir + '/' + timestamp + '_prep_copyDetails.txt'
		copy_details_hd = open(copy_details, 'w')
	else:
		print ("+ Sample files will be linked...")	
	
	list_reads = []
	for index, row in pd_samples_retrieved.iterrows():
		if (options.copy):
			## TODO: debug & set threads to copy faster
			shutil.copy(row['sample'], os.path.join(outdir_dict[row['new_name']], row['new_file']))
			string = row['sample'] + '\t' + os.path.join(outdir_dict[row['new_name']], row['new_file']) + '\n'
			copy_details_hd.write(string)
		else:
			list_reads.append(row['new_file'])
			if options.project:
				HCGB_files.get_symbolic_link_file(row['sample'],
					os.path.join(outdir_dict[row['new_name']], row['new_file']))

	if (options.copy):
		print ("+ Sample files have been copied...")
		copy_details_hd.close()
	else:
		if not options.project:
			HCGB_files.get_symbolic_link(list_reads, outdir)
	
	print ("\n*************** Finish *******************")
	start_time_partial = HCGB_time.timestamp(start_time_total)

	print ("+ Exiting prep module.")
	return()
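
The --rename option expects a headerless two-column CSV mapping current sample names to new ones. A hypothetical example of such a file and the dictionary the code builds from it:

## rename_samples.csv (hypothetical content):
##   old_sample_name_1,new1
##   old_sample_name_2,new2

import pandas as pd
names_retrieved = pd.read_csv('rename_samples.csv', sep=',', index_col=0,
                              header=None).squeeze("columns").to_dict()
## -> {'old_sample_name_1': 'new1', 'old_sample_name_2': 'new2'}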