Example 1
def Annotation_Prodigal(args):

    input_genome_folder = args['i']
    file_extension = args['x']
    output_prefix = args['p']
    meta_mode = args['meta']
    num_threads = args['t']

    # create output folders
    output_folder_sco = '%s_prodigal_sco' % output_prefix
    output_folder_ffn = '%s_prodigal_ffn' % output_prefix
    output_folder_faa = '%s_prodigal_faa' % output_prefix
    output_folder_gbk = '%s_prodigal_gbk' % output_prefix

    force_create_folder(output_folder_sco)
    force_create_folder(output_folder_ffn)
    force_create_folder(output_folder_faa)
    force_create_folder(output_folder_gbk)

    # get input genome list
    input_genome_re = '%s/*.%s' % (input_genome_folder, file_extension)
    input_genome_file_list = [os.path.basename(file_name) for file_name in glob.glob(input_genome_re)]

    # prepare command list
    list_for_multiple_arguments_Prodigal = []
    for input_genome in input_genome_file_list:
        list_for_multiple_arguments_Prodigal.append([input_genome, input_genome_folder, meta_mode, output_folder_sco, output_folder_ffn, output_folder_faa, output_folder_gbk])

    # run prodigal with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(prodigal_worker, list_for_multiple_arguments_Prodigal)
    pool.close()
    pool.join()
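
A minimal usage sketch (hypothetical paths and settings; it assumes Annotation_Prodigal and the helpers it calls, such as force_create_folder and prodigal_worker, are importable from the surrounding module):

# hypothetical call: annotate every .fna genome under "input_genomes"
# with 4 parallel Prodigal jobs, writing results under the "demo" prefix
args = {
    'i': 'input_genomes',  # folder holding the input genomes
    'x': 'fna',            # file extension of the genomes
    'p': 'demo',           # prefix of the four output folders
    'meta': False,         # True to run Prodigal in metagenome mode
    't': 4,                # number of worker processes
}
Annotation_Prodigal(args)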
Example 2
def split_folder(args):

    file_folder = args['in']
    file_ext = args['x']
    folder_num = int(args['n'])

    # create folder
    file_folder_sep = '%s_sep' % file_folder
    force_create_folder(file_folder_sep)

    # get file list
    file_name_re = '%s/*.%s' % (file_folder, file_ext)
    file_name_list = [
        os.path.basename(file_name) for file_name in glob.glob(file_name_re)
    ]
    file_name_list = sorted(file_name_list)

    # get the number of files per folder
    file_num_per_folder = round(len(file_name_list) / folder_num)

    n = 1
    while n <= folder_num:

        # define current folder name
        current_folder_name = '%s_%s' % (file_folder, n)

        # get file list in current folder
        if n < folder_num:
            files_in_current_folder = file_name_list[(
                file_num_per_folder * (n - 1)):(file_num_per_folder * n)]
        else:
            files_in_current_folder = file_name_list[(file_num_per_folder *
                                                      (n - 1)):]

        # create folder
        pwd_current_folder = '%s/%s' % (file_folder_sep, current_folder_name)
        os.system('mkdir %s' % pwd_current_folder)

        # copy files to new folder
        for each_file in files_in_current_folder:
            os.system('cp %s/%s %s/' %
                      (file_folder, each_file, pwd_current_folder))

        n += 1
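
The slicing above assigns round(len(file_name_list) / folder_num) files to each folder and lets the last folder absorb any remainder. A standalone sketch of that chunking logic (illustrative file names only):

# illustrative re-run of the chunking used by split_folder
file_name_list = sorted('file_%02d.fna' % i for i in range(1, 11))  # 10 files
folder_num = 3
file_num_per_folder = round(len(file_name_list) / folder_num)       # 3

for n in range(1, folder_num + 1):
    if n < folder_num:
        chunk = file_name_list[file_num_per_folder * (n - 1):file_num_per_folder * n]
    else:
        chunk = file_name_list[file_num_per_folder * (n - 1):]      # last folder takes the rest
    print(n, len(chunk))
# folders 1 and 2 each get 3 files, folder 3 gets the remaining 4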
Example 3
def download_GenBank_genome(args):

    csv_file = args['csv']
    get_fna = args['fna']
    get_faa = args['faa']
    get_gbff = args['gbff']
    with_name = args['name']
    num_threads = args['t']

    time_format = '[%Y-%m-%d %H:%M:%S] '

    if (get_fna is False) and (get_faa is False) and (get_gbff is False):
        print(
            datetime.now().strftime(time_format) +
            'Please specify at least one file type to download, program exited'
        )
        exit()

    in_file_path, in_file_basename, in_file_extension = sep_path_basename_ext(
        csv_file)
    downloaded_genome_folder = '%s_genomes' % in_file_basename
    force_create_folder(downloaded_genome_folder)

    # report
    print(datetime.now().strftime(time_format) +
          'Downloading genomes with %s cores' % (num_threads))

    # download genome with multiprocessing
    list_for_multiple_arguments_download_worker = []
    for genome_record in open(csv_file):

        if not genome_record.startswith('#Organism Name'):
            genome_record_split = genome_record.strip().split(',')
            list_for_multiple_arguments_download_worker.append([
                genome_record_split, downloaded_genome_folder, get_fna,
                get_faa, get_gbff, with_name
            ])

    # run genome_download_worker with multiprocessing
    pool = mp.Pool(processes=num_threads)
    pool.map(genome_download_worker,
             list_for_multiple_arguments_download_worker)
    pool.close()
    pool.join()
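
A minimal usage sketch (hypothetical file name and option values; it assumes download_GenBank_genome and its helpers, e.g. sep_path_basename_ext and genome_download_worker, are importable):

# hypothetical call: fetch only the .fna files for the assemblies listed
# in an NCBI genome-browser CSV export, using 4 download processes
args = {
    'csv': 'selected_assemblies.csv',  # CSV whose header line starts with '#Organism Name'
    'fna': True,                       # download nucleotide .fna files
    'faa': False,                      # skip protein .faa files
    'gbff': False,                     # skip .gbff files
    'name': True,                      # forwarded to genome_download_worker (file-naming option)
    't': 4,                            # number of parallel download processes
}
download_GenBank_genome(args)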
Example 4
def run_blast_worker(argument_list):

    pwd_input_file = argument_list[0]
    run_blast = argument_list[1]
    run_diamond = argument_list[2]
    KEGG_DB_seq = argument_list[3]
    KEGG_DB_seq_diamond = argument_list[4]
    op_dir = argument_list[5]
    evalue_cutoff = argument_list[6]
    threads_num = argument_list[7]

    ################################################### define file name ###################################################

    input_file_path, in_file_basename, input_file_ext = sep_path_basename_ext(
        pwd_input_file)

    blast_results = '%s/%s_KEGG_wd/%s_blast.tab' % (op_dir, in_file_basename,
                                                    in_file_basename)
    blast_results_best_hit = '%s/%s_KEGG_wd/%s_blast_best_hits.tab' % (
        op_dir, in_file_basename, in_file_basename)

    # create output folder
    force_create_folder('%s/%s_KEGG_wd' % (op_dir, in_file_basename))

    ########################################## blast against KEGG database (Shan) ##########################################

    if run_blast is True:

        if run_diamond is False:
            blastp_cmd = 'blastp -query %s -db %s -out %s -outfmt 6 -evalue %s -num_alignments 10 -num_threads %s' % (
                pwd_input_file, KEGG_DB_seq, blast_results, evalue_cutoff,
                threads_num)
            os.system(blastp_cmd)

        else:
            diamond_cmd = 'diamond blastp -q %s --db %s --out %s --outfmt 6 --evalue %s --block-size 1 --threads %s --quiet' % (
                pwd_input_file, KEGG_DB_seq_diamond, blast_results,
                evalue_cutoff, threads_num)
            os.system(diamond_cmd)

        # only keep the best hit
        keep_blast_hit_with_highest_bit_score(blast_results,
                                              blast_results_best_hit)
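
Because run_blast_worker unpacks its single argument by position, callers (see Annotation_KEGG below) have to build the list in exactly this order. A sketch with hypothetical values:

# positional contract expected by run_blast_worker
argument_list = [
    'faa_files/genome_1.faa',          # [0] pwd_input_file
    True,                              # [1] run_blast
    False,                             # [2] run_diamond (False -> blastp, True -> diamond)
    'KEGG_db/kegg_db_seq.fasta',       # [3] KEGG_DB_seq
    'KEGG_db/kegg_db_seq.fasta.dmnd',  # [4] KEGG_DB_seq_diamond
    'KEGG_op',                         # [5] op_dir
    0.001,                             # [6] evalue_cutoff
    1,                                 # [7] threads_num
]
run_blast_worker(argument_list)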
Example 5
def Annotation_KEGG(args):

    input_file_faa = args['seq_in']
    input_file_user_ko = args['ko_in']
    file_extension = args['x']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    KEGG_DB_folder = args['db_dir']
    run_diamond = args['diamond']
    num_threads = args['t']
    evalue_cutoff = args['evalue']

    run_blast = None
    if (input_file_faa is not None) and (input_file_user_ko is None):
        run_blast = True
    elif (input_file_faa is None) and (input_file_user_ko is not None):
        run_blast = False
    else:
        print(
            datetime.now().strftime(time_format) +
            'Please provide input file with either "-seq_in" or "-ko_in", do not provide both'
        )
        exit()

    if run_blast is True:
        input_file_folder = input_file_faa
    else:
        input_file_folder = input_file_user_ko

    # check whether the input file/folder exists
    if (os.path.isfile(input_file_folder) is
            False) and (os.path.isdir(input_file_folder) is False):
        print(datetime.now().strftime(time_format) +
              'input file/folder not found, program exited')
        exit()

    if run_blast is True:
        print(datetime.now().strftime(time_format) +
              'Input sequence file detected, will run blastp/diamond first')
        sleep(0.5)
    else:
        print(datetime.now().strftime(time_format) +
              'Annotation results provided, blastp/diamond skipped')
        sleep(0.5)

    ################################################# define file name #################################################

    KEGG_DB_seq = '%s/kegg_db_seq.fasta' % KEGG_DB_folder
    KEGG_DB_seq_diamond = '%s/kegg_db_seq.fasta.dmnd' % KEGG_DB_folder
    KEGG_DB_seq2ko = '%s/kegg_db_seq2ko.txt' % KEGG_DB_folder
    KEGG_DB_ko = '%s/ko00001.keg' % KEGG_DB_folder

    ########################################## check whether diamond db exist ##########################################

    if (run_blast is True) and (run_diamond is True):
        if os.path.isfile(KEGG_DB_seq_diamond) is False:
            print(datetime.now().strftime(time_format) +
                  'DB file not found, making diamond db with %s' % KEGG_DB_seq)

            if os.path.isfile(KEGG_DB_seq) is True:
                diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (
                    KEGG_DB_seq, KEGG_DB_seq_diamond)
                os.system(diamond_makedb_cmd)
            else:
                print(datetime.now().strftime(time_format) +
                      '%s not found, program exited' % KEGG_DB_seq)
                exit()

    ########################################### check whether blast+ db exist ##########################################

    if (run_blast is True) and (run_diamond is False):

        unfound_db_index_file = []
        for db_index in [
                'phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq'
        ]:
            pwd_db_index = '%s/kegg_db_seq.fasta.%s' % (KEGG_DB_folder,
                                                        db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)
        if len(unfound_db_index_file) > 0:
            print(datetime.now().strftime(time_format) +
                  'blast db index not found, running makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (
                KEGG_DB_seq, KEGG_DB_seq)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) +
                  'makeblastdb finished')

    ######################################### Run blastp with multiprocessing ##########################################

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(
            input_file_folder)
        run_blast_worker([
            input_file_folder, run_blast, run_diamond, KEGG_DB_seq,
            KEGG_DB_seq_diamond, input_file_path, evalue_cutoff, num_threads
        ])

    if os.path.isdir(input_file_folder) is True:

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        force_create_folder(output_folder)

        # check whether input files exist
        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]

        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) +
                  'input file not found, program exited')
            exit()

        # run blastp with multiprocessing
        if run_blast is True:
            print(datetime.now().strftime(time_format) +
                  'Running Blast/Diamond for %s input files with %s cores' %
                  (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_blast = []
        for input_file in input_file_name_list:
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)
            list_for_multiple_arguments_blast.append([
                pwd_input_file, run_blast, run_diamond, KEGG_DB_seq,
                KEGG_DB_seq_diamond, output_folder, evalue_cutoff, 1
            ])

        # run blastp with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(run_blast_worker, list_for_multiple_arguments_blast)
        pool.close()
        pool.join()

    ############################################## Read in KEGG DB files ###############################################

    print(datetime.now().strftime(time_format) + 'Reading in KEGG DB files')

    As_description_dict = {}
    Bs_description_dict = {}
    Cs_description_dict = {}
    Ds_description_dict = {}
    D2ABCD_dict = {}
    current_A = ''
    current_B = ''
    current_C = ''
    for each_line in open(KEGG_DB_ko):
        if each_line[0] in ['A', 'B', 'C', 'D']:
            each_line_split = each_line.strip().split(' ')

            if each_line[0] == 'A':
                current_A_id = each_line_split[0]
                current_A_description = ' '.join(each_line_split[1:])
                current_A = current_A_id
                As_description_dict[current_A_id] = current_A_description

            elif each_line[0] == 'B':
                if len(each_line_split) > 1:
                    current_B_id = each_line_split[2]
                    current_B_description = ' '.join(each_line_split[3:])
                    current_B = current_B_id
                    Bs_description_dict[current_B_id] = current_B_description

            elif each_line[0] == 'C':
                current_C_id = each_line_split[4]
                current_C_description = ' '.join(each_line_split[5:])
                current_C = current_C_id
                Cs_description_dict[current_C_id] = current_C_description

            elif each_line[0] == 'D':
                current_D_id = each_line_split[6]
                current_D_description = ' '.join(each_line_split[7:])
                Ds_description_dict[current_D_id] = current_D_description
                ABCD_value = 'A_%s|B_%s|C_%s|D_%s' % (current_A, current_B,
                                                      current_C, current_D_id)
                if current_D_id not in D2ABCD_dict:
                    D2ABCD_dict[current_D_id] = [ABCD_value]
                elif (current_D_id
                      in D2ABCD_dict) and (ABCD_value
                                           not in D2ABCD_dict[current_D_id]):
                    D2ABCD_dict[current_D_id].append(ABCD_value)

    # get db_seq_to_KO_dict
    db_seq_to_KO_dict = {}
    if run_blast is True:
        for each_hit in open(KEGG_DB_seq2ko):
            each_hit_split = each_hit.strip().split('\t')
            db_seq = each_hit_split[0]
            hit_id_KO = each_hit_split[1]
            if hit_id_KO != '':
                db_seq_to_KO_dict[db_seq] = hit_id_KO

    ########################################################################################################################

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:

        # check whether the depth file exists
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running KEGG annotation for 1 file with %s cores' %
              (num_threads))
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(
            input_file_folder)
        parse_blast_op_worker([
            input_file_folder, run_blast, As_description_dict,
            Bs_description_dict, Cs_description_dict, Ds_description_dict,
            D2ABCD_dict, db_seq_to_KO_dict, input_file_path, depth_file,
            pct_by_all
        ])

    if os.path.isdir(input_file_folder) is True:

        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [
            os.path.basename(file_name)
            for file_name in glob.glob(input_file_re)
        ]

        # check whether the depth file exists
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(
                    datetime.now().strftime(time_format) +
                    'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence files.'
                )
                print(
                    datetime.now().strftime(time_format) +
                    'single depth file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth folder not found, program exited!')
                exit()

            if os.path.isdir(depth_file) is True:

                undetected_depth_file = []
                for input_seq_file in input_file_name_list:
                    input_seq_file_basename = '.'.join(
                        input_seq_file.split('.')[:-1])
                    input_seq_file_depth = '%s/%s.depth' % (
                        depth_file, input_seq_file_basename)
                    if os.path.isfile(input_seq_file_depth) is False:
                        undetected_depth_file.append(input_seq_file_depth)

                if len(undetected_depth_file) > 0:
                    print(
                        datetime.now().strftime(time_format) +
                        'the following depth files not found, program exited!')
                    print(','.join(undetected_depth_file))
                    exit()

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        input_folder_name = input_file_folder
        if '/' in input_file_folder:
            input_folder_name = input_file_folder.split('/')[-1]

        # parse blast results with multiprocessing
        if run_blast is True:
            print(
                datetime.now().strftime(time_format) +
                'Parsing Blast/Diamond results for %s input files with %s cores'
                % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_parse_blast_op = []
        for input_file in input_file_name_list:

            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file,
                                                    input_file_basename)

            list_for_multiple_arguments_parse_blast_op.append([
                pwd_input_file, run_blast, As_description_dict,
                Bs_description_dict, Cs_description_dict, Ds_description_dict,
                D2ABCD_dict, db_seq_to_KO_dict, output_folder,
                input_file_depth, pct_by_all
            ])

        # parse blast results with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(parse_blast_op_worker,
                 list_for_multiple_arguments_parse_blast_op)
        pool.close()
        pool.join()

        ######################################################### get dataframe #########################################################

        print(datetime.now().strftime(time_format) +
              'Data matrix exported to:')

        for ko_level in ['A', 'B', 'C', 'D']:
            annotation_df_GeneNumber = '%s/%s_%s_GeneNumber.txt' % (
                output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct = '%s/%s_%s_GeneNumber_pct.txt' % (
                output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct_by_all = '%s/%s_%s_GeneNumber_pct_by_all.txt' % (
                output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth = '%s/%s_%s_TotalDepth.txt' % (
                output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct = '%s/%s_%s_TotalDepth_pct.txt' % (
                output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct_by_all = '%s/%s_%s_TotalDepth_pct_by_all.txt' % (
                output_folder, input_folder_name, ko_level)

            #################### get GeneNumber df and report ####################

            get_KEGG_annot_df(output_folder,
                              ko_level,
                              annotation_df_GeneNumber,
                              annotation_df_GeneNumber_pct,
                              annotation_df_GeneNumber_pct_by_all,
                              with_depth=False,
                              pct_by_all=pct_by_all)

            print(annotation_df_GeneNumber.split('/')[-1])
            print(annotation_df_GeneNumber_pct.split('/')[-1])
            if pct_by_all is True:
                print(annotation_df_GeneNumber_pct_by_all.split('/')[-1])

            #################### get TotalDepth df and report ####################

            if depth_file is not None:
                get_KEGG_annot_df(output_folder,
                                  ko_level,
                                  annotation_df_TotalDepth,
                                  annotation_df_TotalDepth_pct,
                                  annotation_df_TotalDepth_pct_by_all,
                                  with_depth=True,
                                  pct_by_all=pct_by_all)

                print(annotation_df_TotalDepth.split('/')[-1])
                print(annotation_df_TotalDepth_pct.split('/')[-1])
                if pct_by_all is True:
                    print(annotation_df_TotalDepth_pct_by_all.split('/')[-1])

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
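
The ko00001.keg parsing loop above flattens the four-level KEGG hierarchy into lookup dictionaries; in particular, D2ABCD_dict maps each level-D (KO) identifier to every A|B|C|D path it occurs under, using the 'A_%s|B_%s|C_%s|D_%s' format from the code. An illustrative sketch of the resulting structures (identifiers chosen for illustration only):

# illustrative content of the lookup dictionaries built from ko00001.keg
As_description_dict = {'A09100': 'Metabolism'}
Bs_description_dict = {'09101': 'Carbohydrate metabolism'}
Cs_description_dict = {'00010': 'Glycolysis / Gluconeogenesis'}
Ds_description_dict = {'K00844': 'HK; hexokinase'}

# one KO can sit under several pathways, hence a list of A|B|C|D paths per KO
D2ABCD_dict = {'K00844': ['A_A09100|B_09101|C_00010|D_K00844']}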
Example 6
def COG2020(args):

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_cog_20_fa = '%s/cog-20.fa' % DB_dir
    pwd_cog_20_fa_diamond = '%s/cog-20.fa.dmnd' % DB_dir
    pwd_cog_20_cog_csv = '%s/cog-20.cog.csv' % DB_dir
    pwd_cog_20_def_tab = '%s/cog-20.def.tab' % DB_dir
    pwd_fun_20_tab = '%s/fun-20.tab' % DB_dir

    ############################################ check whether db file exist ###########################################

    # check whether db files exist
    unfound_inputs = []
    for each_input in [pwd_cog_20_fa, pwd_cog_20_def_tab, pwd_fun_20_tab]:
        if (not os.path.isfile(each_input)) and (
                not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    if run_diamond is True:
        if os.path.isfile(pwd_cog_20_fa_diamond) is False:
            print(
                datetime.now().strftime(time_format) +
                'DB file for diamond not found, please refer to the help info for diamond db preparation'
            )
            print(datetime.now().strftime(time_format) + 'Program exited!')
            exit()

    ################################################# read db into dict ################################################

    # get protein_to_cog_dict (cog-20.cog.csv)
    protein_to_cog_dict = {}
    for each_line in open(pwd_cog_20_cog_csv):
        each_line_split = each_line.strip().split(',')
        protein_id = each_line_split[2]
        protein_id_no_dot = '_'.join(protein_id.split('.'))
        cog_id = each_line_split[6]
        if protein_id_no_dot not in protein_to_cog_dict:
            protein_to_cog_dict[protein_id_no_dot] = {cog_id}
        else:
            protein_to_cog_dict[protein_id_no_dot].add(cog_id)

    # get cog_id_to_category_dict and cog_id_to_description_dict (cog-20.def.tab)
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    for cog_id_to_cate_des in open(pwd_cog_20_def_tab,
                                   encoding='windows-1252'):
        if not cog_id_to_cate_des.startswith('#'):
            cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
            cog_id = cog_id_to_cate_des_split[0]
            cog_cate = cog_id_to_cate_des_split[1]
            cog_des = cog_id_to_cate_des_split[2]
            cog_id_to_category_dict[cog_id] = cog_cate
            cog_id_to_description_dict[cog_id] = cog_des

    # get cog_category_to_description_dict (fun-20.tab)
    cog_category_list = []
    cog_category_to_description_dict = {}
    for cog_category in open(pwd_fun_20_tab):
        if not cog_category.startswith('#'):
            cog_category_split = cog_category.strip().split('\t')
            cog_category_list.append(cog_category_split[0])
            cog_category_to_description_dict[
                cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether the depth file exists
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running COG annotation for 1 file with %s cores' %
              (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)

        COG2020_worker([
            file_in, pwd_cog_20_fa, protein_to_cog_dict,
            cog_id_to_category_dict, cog_id_to_description_dict,
            cog_category_list, cog_category_to_description_dict, sequence_type,
            file_in_path, num_threads, run_diamond, evalue_cutoff, depth_file,
            pct_by_all
        ])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether the input folder exists
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited')
            exit()

        else:
            # check whether input files exist
            input_file_re = '%s/*.%s' % (file_in, file_extension)
            input_file_name_list = [
                os.path.basename(file_name)
                for file_name in glob.glob(input_file_re)
            ]

            if len(input_file_name_list) == 0:
                print(datetime.now().strftime(time_format) +
                      'input file not found, program exited')
                exit()

            # check whether the depth file exists
            if depth_file is not None:

                if os.path.isfile(depth_file) is True:
                    print(
                        datetime.now().strftime(time_format) +
                        'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence files.'
                    )
                    print(
                        datetime.now().strftime(time_format) +
                        'a single file (not folder) detected, program exited!')
                    exit()

                if os.path.isdir(depth_file) is False:
                    print(datetime.now().strftime(time_format) +
                          'provided depth folder not found, program exited!')
                    exit()

                if os.path.isdir(depth_file) is True:

                    undetected_depth_file = []
                    for input_seq_file in input_file_name_list:
                        input_seq_file_basename = '.'.join(
                            input_seq_file.split('.')[:-1])
                        input_seq_file_depth = '%s/%s.depth' % (
                            depth_file, input_seq_file_basename)
                        if os.path.isfile(input_seq_file_depth) is False:
                            undetected_depth_file.append(input_seq_file_depth)

                    if len(undetected_depth_file) > 0:
                        print(
                            datetime.now().strftime(time_format) +
                            'the following depth files not found, program exited!'
                        )
                        print(','.join(undetected_depth_file))
                        exit()

            ################################################### define file name ###################################################

            if '/' in file_in:
                file_in_folder_name = file_in.split('/')[-1]
            else:
                file_in_folder_name = file_in

            output_folder = '%s_COG2020_wd' % file_in_folder_name

            # create output folder
            force_create_folder(output_folder)

            ######################################################### main #########################################################

            print(datetime.now().strftime(time_format) +
                  'Running COG annotation for %s files with %s cores' %
                  (len(input_file_name_list), num_threads))

            list_for_multiple_arguments_COG = []
            for input_file in input_file_name_list:

                input_file_basename = '.'.join(input_file.split('.')[:-1])
                pwd_input_file = '%s/%s' % (file_in, input_file)

                # get path to current depth file
                if depth_file is None:
                    input_file_depth = None
                else:
                    input_file_depth = '%s/%s.depth' % (depth_file,
                                                        input_file_basename)

                list_for_multiple_arguments_COG.append([
                    pwd_input_file, pwd_cog_20_fa, protein_to_cog_dict,
                    cog_id_to_category_dict, cog_id_to_description_dict,
                    cog_category_list, cog_category_to_description_dict,
                    sequence_type, output_folder, 1, run_diamond,
                    evalue_cutoff, input_file_depth, pct_by_all
                ])

            # run COG annotation with multiprocessing
            pool = mp.Pool(processes=num_threads)
            pool.map(COG2020_worker, list_for_multiple_arguments_COG)
            pool.close()
            pool.join()

            ######################################################### get dataframe #########################################################

            annotation_df_cog_cate_GeneNumber = '%s/%s_COG2020_cate_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct = '%s/%s_COG2020_cate_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct_by_all = '%s/%s_COG2020_cate_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_cate_TotalDepth = '%s/%s_COG2020_cate_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct = '%s/%s_COG2020_cate_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct_by_all = '%s/%s_COG2020_cate_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_GeneNumber = '%s/%s_COG2020_id_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct = '%s/%s_COG2020_id_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct_by_all = '%s/%s_COG2020_id_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_TotalDepth = '%s/%s_COG2020_id_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct = '%s/%s_COG2020_id_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct_by_all = '%s/%s_COG2020_id_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            print(datetime.now().strftime(time_format) +
                  'Data matrix exported to:')

            # get df
            get_COG_annot_df(output_folder,
                             'cog_cate',
                             annotation_df_cog_cate_GeneNumber,
                             annotation_df_cog_cate_GeneNumber_pct,
                             annotation_df_cog_cate_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            get_COG_annot_df(output_folder,
                             'cog_id',
                             annotation_df_cog_id_GeneNumber,
                             annotation_df_cog_id_GeneNumber_pct,
                             annotation_df_cog_id_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            if pct_by_all is True:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_GeneNumber,
                                 annotation_df_cog_cate_GeneNumber_pct,
                                 annotation_df_cog_cate_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_GeneNumber,
                                 annotation_df_cog_id_GeneNumber_pct,
                                 annotation_df_cog_id_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)

            # report
            if pct_by_all is False:
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_id_GeneNumber.split('/')[-1],
                       annotation_df_cog_id_GeneNumber_pct.split('/')[-1]))
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1]))
            else:
                print(
                    datetime.now().strftime(time_format) + '%s, %s and %s' %
                    (annotation_df_cog_id_GeneNumber.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct_by_all.split('/')[-1])
                )
                print(datetime.now().strftime(time_format) + '%s, %s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct_by_all.split('/')
                       [-1]))

            if depth_file is not None:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_TotalDepth,
                                 annotation_df_cog_cate_TotalDepth_pct,
                                 annotation_df_cog_cate_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_TotalDepth,
                                 annotation_df_cog_id_TotalDepth_pct,
                                 annotation_df_cog_id_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                if pct_by_all is True:
                    get_COG_annot_df(
                        output_folder,
                        'cog_cate',
                        annotation_df_cog_cate_TotalDepth,
                        annotation_df_cog_cate_TotalDepth_pct,
                        annotation_df_cog_cate_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)
                    get_COG_annot_df(
                        output_folder,
                        'cog_id',
                        annotation_df_cog_id_TotalDepth,
                        annotation_df_cog_id_TotalDepth_pct,
                        annotation_df_cog_id_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)

                # report
                if pct_by_all is False:
                    print(datetime.now().strftime(time_format) + '%s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1]))
                    print(
                        datetime.now().strftime(time_format) + '%s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1]))
                else:
                    print(datetime.now().strftime(time_format) +
                          '%s, %s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct_by_all.split(
                               '/')[-1]))
                    print(
                        datetime.now().strftime(time_format) +
                        '%s, %s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct_by_all.split(
                             '/')[-1]))

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
Example 7
def COG2020_worker(argument_list):

    pwd_input_file = argument_list[0]
    pwd_prot2003_2014 = argument_list[1]  # path to the cog-20.fa database file
    protein_id_to_cog_id_dict = argument_list[2]
    cog_id_to_category_dict = argument_list[3]
    cog_id_to_description_dict = argument_list[4]
    cog_category_list = argument_list[5]
    cog_category_to_description_dict = argument_list[6]
    sequence_type = argument_list[7]
    output_folder = argument_list[8]
    thread_num = argument_list[9]
    run_diamond = argument_list[10]
    evalue_cutoff = argument_list[11]
    depth_file = argument_list[12]
    pct_by_all = argument_list[13]

    input_seq_no_path, input_seq_no_ext, input_seq_ext = sep_path_basename_ext(
        pwd_input_file)
    current_output_folder = '%s/%s_COG2020_wd' % (output_folder,
                                                  input_seq_no_ext)

    pwd_blastp_output = '%s/%s_blastp.tab' % (current_output_folder,
                                              input_seq_no_ext)
    pwd_blastp_output_besthits = '%s/%s_blastp_besthits.tab' % (
        current_output_folder, input_seq_no_ext)
    pwd_query_to_cog_txt = '%s/%s_query_to_cog.txt' % (current_output_folder,
                                                       input_seq_no_ext)

    pwd_cog_stats_GeneNumber = '%s/%s_cog_stats_GeneNumber.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_cog_stats_TotalDepth = '%s/%s_cog_stats_TotalDepth.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_GeneNumber = '%s/%s_func_stats_GeneNumber.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_TotalDepth = '%s/%s_func_stats_TotalDepth.txt' % (
        current_output_folder, input_seq_no_ext)

    pwd_cog_stats_GeneNumber_pct = '%s/%s_cog_stats_GeneNumber_pct.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_cog_stats_TotalDepth_pct = '%s/%s_cog_stats_TotalDepth_pct.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_GeneNumber_pct = '%s/%s_func_stats_GeneNumber_pct.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_TotalDepth_pct = '%s/%s_func_stats_TotalDepth_pct.txt' % (
        current_output_folder, input_seq_no_ext)

    pwd_cog_stats_GeneNumber_pct_by_all = '%s/%s_cog_stats_GeneNumber_pct_by_all.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_cog_stats_TotalDepth_pct_by_all = '%s/%s_cog_stats_TotalDepth_pct_by_all.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_GeneNumber_pct_by_all = '%s/%s_func_stats_GeneNumber_pct_by_all.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_func_stats_TotalDepth_pct_by_all = '%s/%s_func_stats_TotalDepth_pct_by_all.txt' % (
        current_output_folder, input_seq_no_ext)

    force_create_folder(current_output_folder)

    input_seq_aa = ''
    if sequence_type in ['N', 'n']:
        input_seq_aa = '%s_aa.fasta' % input_seq_no_ext
        dna2aa(pwd_input_file, input_seq_aa)
    elif sequence_type in ['P', 'p']:
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    # run blastp
    if run_diamond is False:
        os.system(
            'blastp -query %s -db %s -out %s -evalue %s -outfmt 6 -show_gis -num_threads %s'
            % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output,
               evalue_cutoff, thread_num))
    else:
        os.system(
            'diamond blastp -q %s --db %s.dmnd --out %s --evalue %s --outfmt 6 --threads %s --quiet'
            % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output,
               evalue_cutoff, thread_num))

    # keep only best hits
    best_hit({'i': pwd_blastp_output, 'o': pwd_blastp_output_besthits})

    # get query_to_ref_protein_dict
    query_to_ref_protein_dict = {}
    for each_hit in open(pwd_blastp_output_besthits):
        each_hit_split = each_hit.strip().split('\t')
        each_hit_query = each_hit_split[0]
        each_hit_subject = each_hit_split[1]
        each_hit_subject_no_dot = '_'.join(each_hit_subject.split('.'))
        query_to_ref_protein_dict[each_hit_query] = each_hit_subject_no_dot

    # get query sequences list
    query_seq_list = []
    for query_seq in SeqIO.parse(pwd_input_file, 'fasta'):
        query_seq_list.append(query_seq.id)

    # export annotation
    cog_id_num_dict = {}
    cog_id_to_gene_member_dict = {}
    cog_cate_num_dict = {}
    cog_cate_to_gene_member_dict = {}
    genes_with_cog = set()
    pwd_query_to_cog_txt_handle = open(pwd_query_to_cog_txt, 'w')
    pwd_query_to_cog_txt_handle.write('Query\tCOG\tCategory\tDescription\n')
    for query_gene in sorted(query_seq_list):

        if query_gene not in query_to_ref_protein_dict:
            pwd_query_to_cog_txt_handle.write('%s\n' % (query_gene))

        else:
            db_protein_id = query_to_ref_protein_dict[query_gene]
            if db_protein_id not in protein_id_to_cog_id_dict:
                pwd_query_to_cog_txt_handle.write('%s\n' % (query_gene))

            else:
                cog_id_list = protein_id_to_cog_id_dict[db_protein_id]
                for cog_id in cog_id_list:
                    cog_cate = cog_id_to_category_dict[cog_id]
                    cog_des = cog_id_to_description_dict[cog_id]
                    pwd_query_to_cog_txt_handle.write(
                        '%s\t%s\t%s\t%s\n' %
                        (query_gene, cog_id, cog_cate, cog_des))
                    genes_with_cog.add(query_gene)

                    # update cog_id_num_dict
                    if cog_id not in cog_id_num_dict:
                        cog_id_num_dict[cog_id] = 1
                        cog_id_to_gene_member_dict[cog_id] = [query_gene]
                    else:
                        cog_id_num_dict[cog_id] += 1
                        cog_id_to_gene_member_dict[cog_id].append(query_gene)

                    # update cog_cate_num_dict
                    for each_cog_cate in cog_cate:
                        if each_cog_cate not in cog_cate_num_dict:
                            cog_cate_num_dict[each_cog_cate] = 1
                            cog_cate_to_gene_member_dict[each_cog_cate] = [
                                query_gene
                            ]
                        else:
                            cog_cate_num_dict[each_cog_cate] += 1
                            cog_cate_to_gene_member_dict[each_cog_cate].append(
                                query_gene)

    pwd_query_to_cog_txt_handle.close()

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        for each_depth in open(depth_file):
            each_depth_split = each_depth.strip().split('\t')
            gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get TotalDepth of all query genes or genes with cog assignment
    if depth_file is not None:
        genes_with_cog_TotalDepth = get_gene_list_TotalDepth(
            genes_with_cog, gene_depth_dict)
        total_depth_for_all_query_genes = get_gene_list_TotalDepth(
            query_seq_list, gene_depth_dict)

    #################### export cog_stats_GeneNumber ####################

    pwd_cog_stats_GeneNumber_handle = open(pwd_cog_stats_GeneNumber, 'w')
    pwd_cog_stats_GeneNumber_handle.write('COG\tGeneNumber\tDescription\n')
    for each_cog_id in cog_id_num_dict:
        each_cog_id_GeneNumber = cog_id_num_dict[each_cog_id]
        pwd_cog_stats_GeneNumber_handle.write(
            '%s\t%s\t%s\n' % (each_cog_id, each_cog_id_GeneNumber,
                              cog_id_to_description_dict[each_cog_id]))
    pwd_cog_stats_GeneNumber_handle.close()

    #################### export cog_stats_TotalDepth ####################

    if depth_file is not None:
        pwd_cog_stats_TotalDepth_handle = open(pwd_cog_stats_TotalDepth, 'w')
        pwd_cog_stats_TotalDepth_handle.write('COG\tTotalDepth\tDescription\n')
        for each_cog_id in cog_id_to_gene_member_dict:
            each_cog_id_gene_member = cog_id_to_gene_member_dict[each_cog_id]
            each_cog_id_TotalDepth = 0
            for each_gene in each_cog_id_gene_member:
                each_gene_depth = gene_depth_dict[each_gene]
                each_cog_id_TotalDepth += each_gene_depth
            each_cog_id_TotalDepth = float(
                "{0:.2f}".format(each_cog_id_TotalDepth))
            pwd_cog_stats_TotalDepth_handle.write(
                '%s\t%s\t%s\n' % (each_cog_id, each_cog_id_TotalDepth,
                                  cog_id_to_description_dict[each_cog_id]))
        pwd_cog_stats_TotalDepth_handle.close()

    #################### export func_stats_GeneNumber ####################

    pwd_func_stats_GeneNumber_handle = open(pwd_func_stats_GeneNumber, 'w')
    pwd_func_stats_GeneNumber_handle.write(
        'Category\tGeneNumber\tDescription\n')
    for each_cog_cate in cog_category_list:
        each_cog_cate_GeneNumber = 0
        if each_cog_cate in cog_cate_num_dict:
            each_cog_cate_GeneNumber = cog_cate_num_dict[each_cog_cate]
        pwd_func_stats_GeneNumber_handle.write(
            '%s\t%s\t%s\n' % (each_cog_cate, each_cog_cate_GeneNumber,
                              cog_category_to_description_dict[each_cog_cate]))
    pwd_func_stats_GeneNumber_handle.close()

    #################### export func_stats_TotalDepth ####################

    if depth_file is not None:
        pwd_func_stats_TotalDepth_handle = open(pwd_func_stats_TotalDepth, 'w')
        pwd_func_stats_TotalDepth_handle.write(
            'Category\tTotalDepth\tDescription\n')
        for each_cog_cate in cog_category_list:
            each_cog_cate_TotalDepth = 0
            if each_cog_cate in cog_cate_to_gene_member_dict:
                each_cog_cate_gene_member = cog_cate_to_gene_member_dict[
                    each_cog_cate]
                for each_gene in each_cog_cate_gene_member:
                    each_gene_depth = gene_depth_dict[each_gene]
                    each_cog_cate_TotalDepth += each_gene_depth
            each_cog_cate_TotalDepth = float(
                "{0:.2f}".format(each_cog_cate_TotalDepth))
            pwd_func_stats_TotalDepth_handle.write(
                '%s\t%s\t%s\n' %
                (each_cog_cate, each_cog_cate_TotalDepth,
                 cog_category_to_description_dict[each_cog_cate]))
        pwd_func_stats_TotalDepth_handle.close()

    #################### get pct files ####################

    AnnotateNorm(file_in=pwd_cog_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=len(genes_with_cog),
                 file_out=pwd_cog_stats_GeneNumber_pct,
                 file_out_header='COG\tGeneNumber_pct\tDescription\n')
    AnnotateNorm(file_in=pwd_func_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=len(genes_with_cog),
                 file_out=pwd_func_stats_GeneNumber_pct,
                 file_out_header='Category\tGeneNumber_pct\tDescription\n')
    if depth_file is not None:
        AnnotateNorm(file_in=pwd_cog_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=genes_with_cog_TotalDepth,
                     file_out=pwd_cog_stats_TotalDepth_pct,
                     file_out_header='COG\tTotalDepth_pct\tDescription\n')
        AnnotateNorm(file_in=pwd_func_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=genes_with_cog_TotalDepth,
                     file_out=pwd_func_stats_TotalDepth_pct,
                     file_out_header='Category\tTotalDepth_pct\tDescription\n')
    if pct_by_all is True:
        AnnotateNorm(
            file_in=pwd_cog_stats_GeneNumber,
            skip_header=True,
            value_column=2,
            Divisor_value=len(query_seq_list),
            file_out=pwd_cog_stats_GeneNumber_pct_by_all,
            file_out_header='COG\tGeneNumber_pct_by_all\tDescription\n')
        AnnotateNorm(
            file_in=pwd_func_stats_GeneNumber,
            skip_header=True,
            value_column=2,
            Divisor_value=len(query_seq_list),
            file_out=pwd_func_stats_GeneNumber_pct_by_all,
            file_out_header='Category\tGeneNumber_pct_by_all\tDescription\n')
        if depth_file is not None:
            AnnotateNorm(
                file_in=pwd_cog_stats_TotalDepth,
                skip_header=True,
                value_column=2,
                Divisor_value=total_depth_for_all_query_genes,
                file_out=pwd_cog_stats_TotalDepth_pct_by_all,
                file_out_header='COG\tTotalDepth_pct_by_all\tDescription\n'
            )
            AnnotateNorm(
                file_in=pwd_func_stats_TotalDepth,
                skip_header=True,
                value_column=2,
                Divisor_value=total_depth_for_all_query_genes,
                file_out=pwd_func_stats_TotalDepth_pct_by_all,
                file_out_header='Category\tTotalDepth_pct_by_all\tDescription\n'
            )
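
When a depth file is supplied, COG2020_worker (like dbCAN_worker in the next example) reads per-gene depth from a two-column, tab-separated file: a gene id, then its depth. A small self-contained sketch of writing and parsing such a file (file name and values are made up):

# illustrative two-column depth file in the format the workers above expect
with open('demo.depth', 'w') as demo_depth_handle:
    demo_depth_handle.write('gene_1\t15.6\n')
    demo_depth_handle.write('gene_2\t3.2\n')

gene_depth_dict = {}
for each_depth in open('demo.depth'):
    each_depth_split = each_depth.strip().split('\t')
    gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

print(gene_depth_dict)  # {'gene_1': 15.6, 'gene_2': 3.2}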
Example 8
def dbCAN_worker(argument_list):

    pwd_input_file = argument_list[0]
    pwd_hmmscan_parser = argument_list[1]
    pwd_dbCAN_fam_HMMs = argument_list[2]
    sequence_type = argument_list[3]
    output_folder = argument_list[4]
    fam_to_activities_dict = argument_list[5]
    depth_file = argument_list[6]

    input_seq_path, input_seq_no_ext, input_seq_ext = sep_path_basename_ext(
        pwd_input_file)
    current_output_folder = '%s/%s_dbCAN_wd' % (output_folder,
                                                input_seq_no_ext)

    force_create_folder(current_output_folder)

    input_seq_aa = ''
    if (sequence_type == 'N') or (sequence_type == 'n'):
        input_seq_aa = '%s/%s_aa.fasta' % (current_output_folder,
                                           input_seq_no_ext)
        dna2aa(pwd_input_file, input_seq_aa)
    elif (sequence_type == 'P') or (sequence_type == 'p'):
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    hmmscan_cmd = "hmmscan --domtblout %s/%s.out.dm %s %s > %s/%s.out" % (
        current_output_folder, input_seq_no_ext, pwd_dbCAN_fam_HMMs,
        input_seq_aa, current_output_folder, input_seq_no_ext)

    hmmscan_parser_cmd = "sh %s %s/%s.out.dm > %s/%s.out.dm.ps" % (
        pwd_hmmscan_parser, current_output_folder, input_seq_no_ext,
        current_output_folder, input_seq_no_ext)

    final_cat_cmd = "cat %s/%s.out.dm.ps | awk '$5<1e-18&&$10>0.35' > %s/%s.out.dm.ps.stringent" % (
        current_output_folder, input_seq_no_ext, current_output_folder,
        input_seq_no_ext)

    os.system(hmmscan_cmd)
    os.system(hmmscan_parser_cmd)
    os.system(final_cat_cmd)

    ################################### get functional descriptions for query genes ####################################

    pwd_annotation_results = '%s/%s_dbCAN.txt' % (current_output_folder,
                                                  input_seq_no_ext)
    pwd_annotation_results_stats_GeneNumber = '%s/%s_dbCAN_stats_GeneNumber.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_GeneNumber_pct = '%s/%s_dbCAN_stats_GeneNumber_pct.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_TotalDepth = '%s/%s_dbCAN_stats_TotalDepth.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_TotalDepth_pct = '%s/%s_dbCAN_stats_TotalDepth_pct.txt' % (
        current_output_folder, input_seq_no_ext)

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        for each_depth in open(depth_file):
            each_depth_split = each_depth.strip().split('\t')
            gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get all sequences in input seq file
    query_seq_list = []
    for query_seq in SeqIO.parse(pwd_input_file, 'fasta'):
        query_seq_list.append(query_seq.id)

    # get total number and depth of all genes in one file
    total_depth_for_all_query_genes = 0
    if depth_file is not None:
        for gene in query_seq_list:
            gene_depth = gene_depth_dict[gene]
            total_depth_for_all_query_genes += gene_depth

    # parse hmmscan results
    pwd_annotation_results_handle = open(pwd_annotation_results, 'w')
    pwd_annotation_results_handle.write('Query\tFamily\tActivities\n')
    hmm_to_gene_member_dict = {}
    for hmm_hit in open('%s/%s.out.dm.ps.stringent' %
                        (current_output_folder, input_seq_no_ext)):
        hmm_hit_split = hmm_hit.strip().split('\t')
        query_id = hmm_hit_split[2]
        matched_hmm = hmm_hit_split[0]
        matched_hmm_id = matched_hmm.split('.hmm')[0]

        # get activities
        matched_hmm_activities = 'NA'
        matched_hmm_id_no_underscore = matched_hmm_id
        if '_' in matched_hmm_id_no_underscore:
            matched_hmm_id_no_underscore = matched_hmm_id_no_underscore.split(
                '_')[0]
        if matched_hmm_id_no_underscore in fam_to_activities_dict:
            matched_hmm_activities = fam_to_activities_dict[
                matched_hmm_id_no_underscore]

        # get hmm_to_num_dict
        if matched_hmm_id not in hmm_to_gene_member_dict:
            hmm_to_gene_member_dict[matched_hmm_id] = [query_id]
        else:
            hmm_to_gene_member_dict[matched_hmm_id].append(query_id)

        # write out
        pwd_annotation_results_handle.write(
            '%s\t%s\t%s\n' % (query_id, matched_hmm, matched_hmm_activities))

    pwd_annotation_results_handle.close()

    #################### get summary of annotation results GeneNumber ####################

    pwd_annotation_results_stats_GeneNumber_handle = open(
        pwd_annotation_results_stats_GeneNumber, 'w')
    pwd_annotation_results_stats_GeneNumber_handle.write(
        'Family\tGeneNumber\tActivities\n')
    total_GeneNumber_identified = 0
    for each_hmm in hmm_to_gene_member_dict:
        each_hmm_id = each_hmm.split('.hmm')[0]
        each_hmm_GeneNumber = len(hmm_to_gene_member_dict[each_hmm_id])

        each_hmm_activities = 'NA'
        matched_hmm_id_no_underscore = each_hmm_id
        if '_' in matched_hmm_id_no_underscore:
            matched_hmm_id_no_underscore = matched_hmm_id_no_underscore.split(
                '_')[0]
        if matched_hmm_id_no_underscore in fam_to_activities_dict:
            each_hmm_activities = fam_to_activities_dict[
                matched_hmm_id_no_underscore]

        pwd_annotation_results_stats_GeneNumber_handle.write(
            '%s\t%s\t%s\n' %
            (each_hmm_id, each_hmm_GeneNumber, each_hmm_activities))
        total_GeneNumber_identified += each_hmm_GeneNumber
    pwd_annotation_results_stats_GeneNumber_handle.close()

    #################### get summary of annotation results GeneNumber pct ####################

    AnnotateNorm(file_in=pwd_annotation_results_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=total_GeneNumber_identified,
                 file_out=pwd_annotation_results_stats_GeneNumber_pct,
                 file_out_header='Family\tGeneNumber_pct\tActivities\n')

    #################### get summary of annotation results TotalDepth ####################

    if depth_file is not None:
        pwd_annotation_results_stats_TotalDepth_handle = open(
            pwd_annotation_results_stats_TotalDepth, 'w')
        pwd_annotation_results_stats_TotalDepth_handle.write(
            'Family\tTotalDepth\tActivities\n')
        total_depth_identified = 0
        for each_hmm in hmm_to_gene_member_dict:
            each_hmm_id = each_hmm.split('.hmm')[0]
            each_hmm_TotalDepth = 0
            for each_gene in hmm_to_gene_member_dict[each_hmm_id]:
                each_gene_depth = gene_depth_dict[each_gene]
                each_hmm_TotalDepth += each_gene_depth
            each_hmm_TotalDepth = float("{0:.2f}".format(each_hmm_TotalDepth))

            each_hmm_activities = 'NA'
            matched_hmm_id_no_underscore = each_hmm_id
            if '_' in matched_hmm_id_no_underscore:
                matched_hmm_id_no_underscore = matched_hmm_id_no_underscore.split(
                    '_')[0]
            if matched_hmm_id_no_underscore in fam_to_activities_dict:
                each_hmm_activities = fam_to_activities_dict[
                    matched_hmm_id_no_underscore]

            pwd_annotation_results_stats_TotalDepth_handle.write(
                '%s\t%s\t%s\n' %
                (each_hmm_id, each_hmm_TotalDepth, each_hmm_activities))
            total_depth_identified += each_hmm_TotalDepth

        pwd_annotation_results_stats_TotalDepth_handle.close()

        #################### get summary of annotation results TotalDepth pct ####################

        AnnotateNorm(file_in=pwd_annotation_results_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=total_depth_identified,
                     file_out=pwd_annotation_results_stats_TotalDepth_pct,
                     file_out_header='Family\tTotalDepth_pct\tActivities\n')
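
AnnotateNorm() is called above but its definition is not part of this excerpt. Below is a minimal sketch of such a helper, assuming it does no more than rescale one (1-based) numeric column into percentages of the supplied Divisor_value and write the result under a new header; the tool's actual implementation may differ.

def AnnotateNorm(file_in, skip_header, value_column, Divisor_value, file_out, file_out_header):
    # rescale one (1-based) numeric column into percentages of Divisor_value
    file_out_handle = open(file_out, 'w')
    file_out_handle.write(file_out_header)
    for line_index, each_line in enumerate(open(file_in)):
        if skip_header and line_index == 0:
            continue
        each_line_split = each_line.strip().split('\t')
        value_pct = 0
        if Divisor_value != 0:
            value_pct = float(each_line_split[value_column - 1]) * 100 / Divisor_value
        each_line_split[value_column - 1] = '%.2f' % value_pct
        file_out_handle.write('\t'.join(each_line_split) + '\n')
    file_out_handle.close()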
Esempio n. 9
0
def dbCAN(args):

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    DB_dir = args['db_dir']
    num_threads = args['t']

    pwd_hmmscan_parser = '%s/hmmscan-parser.sh' % DB_dir
    pwd_dbCAN_fam_HMMs = '%s/dbCAN-fam-HMMs.txt' % DB_dir
    CAZyDB_fam_activities = '%s/CAZyDB.fam-activities.txt' % DB_dir

    CAZyDB_fam_activities_07312019 = '%s/CAZyDB.07312019.fam-activities.txt' % DB_dir
    if (os.path.isfile(CAZyDB_fam_activities_07312019) is
            True) and (os.path.isfile(CAZyDB_fam_activities) is False):
        os.system('mv %s %s' %
                  (CAZyDB_fam_activities_07312019, CAZyDB_fam_activities))

    ############################################ check whether db files exist ##########################################

    # check whether db files exist
    unfound_inputs = []
    for each_input in [pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, CAZyDB_fam_activities]:
        if (not os.path.isfile(each_input)) and (
                not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # store CAZyDB.fam-activities.txt in dict
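    # each useful line of this file is assumed to contain the family id and its activities,
    # separated by a tab plus two spaces (matching the split below), e.g.
    # 'GH1\t  beta-glucosidase (EC 3.2.1.21); ...'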
    fam_to_activities_dict = {}
    for each_fam in open(CAZyDB_fam_activities):
        each_fam_split = each_fam.strip().split('\t  ')
        if len(each_fam_split) == 2:
            fam_id = each_fam_split[0]
            fam_activities = each_fam_split[1]
            fam_to_activities_dict[fam_id] = fam_activities

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether the depth file exists
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running dbCAN for 1 file with %s cores' % (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)
        dbCAN_worker([
            file_in, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, sequence_type,
            file_in_path, fam_to_activities_dict, depth_file
        ])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether the input folder exists
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited!')
            exit()

        else:
            # check whether input genomes exist
            input_file_re = '%s/*.%s' % (file_in, file_extension)
            input_file_name_list = [
                os.path.basename(file_name)
                for file_name in glob.glob(input_file_re)
            ]

            if len(input_file_name_list) == 0:
                print(datetime.now().strftime(time_format) +
                      'input file not found, program exited!')
                exit()

            # check whether depth files exist
            if depth_file is not None:

                if os.path.isfile(depth_file) is True:
                    print(
                        datetime.now().strftime(time_format) +
                        'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence files.'
                    )
                    print(
                        datetime.now().strftime(time_format) +
                        'a single file (not folder) detected, program exited!')
                    exit()

                if os.path.isdir(depth_file) is False:
                    print(datetime.now().strftime(time_format) +
                          'provided depth folder not found, program exited!')
                    exit()

                if os.path.isdir(depth_file) is True:

                    undetected_depth_file = []
                    for input_seq_file in input_file_name_list:
                        input_seq_file_basename = '.'.join(
                            input_seq_file.split('.')[:-1])
                        input_seq_file_depth = '%s/%s.depth' % (
                            depth_file, input_seq_file_basename)
                        if os.path.isfile(input_seq_file_depth) is False:
                            undetected_depth_file.append(input_seq_file_depth)

                    if len(undetected_depth_file) > 0:
                        print(
                            datetime.now().strftime(time_format) +
                            'the following depth files were not found, program exited!'
                        )
                        print(','.join(undetected_depth_file))
                        exit()

            ################################################### define file name ###################################################

            if '/' in file_in:
                file_in_folder_name = file_in.rstrip('/').split('/')[-1]
            else:
                file_in_folder_name = file_in

            output_folder = '%s_dbCAN_wd' % file_in_folder_name
            annotation_df_GeneNumber = '%s/%s_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_GeneNumber_pct = '%s/%s_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_TotalDepth = '%s/%s_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_TotalDepth_pct = '%s/%s_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)

            # create output folder
            force_create_folder(output_folder)

            ######################################################### main #########################################################

            print(datetime.now().strftime(time_format) +
                  'Running dbCAN for %s input files with %s cores' %
                  (len(input_file_name_list), num_threads))

            list_for_multiple_arguments_dbCAN = []
            for input_file in input_file_name_list:

                input_file_basename = '.'.join(input_file.split('.')[:-1])
                pwd_input_file = '%s/%s' % (file_in, input_file)

                # get path to current depth file
                if depth_file is None:
                    input_file_depth = None
                else:
                    input_file_depth = '%s/%s.depth' % (depth_file,
                                                        input_file_basename)

                list_for_multiple_arguments_dbCAN.append([
                    pwd_input_file, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs,
                    sequence_type, output_folder, fam_to_activities_dict,
                    input_file_depth
                ])

            # run dbCAN annotation with multiprocessing
            pool = mp.Pool(processes=num_threads)
            pool.map(dbCAN_worker, list_for_multiple_arguments_dbCAN)
            pool.close()
            pool.join()

            ######################################################### get dataframe #########################################################

            get_dbCAN_annot_df(output_folder,
                               annotation_df_GeneNumber,
                               annotation_df_GeneNumber_pct,
                               with_depth=False)
            if depth_file is not None:
                get_dbCAN_annot_df(output_folder,
                                   annotation_df_TotalDepth,
                                   annotation_df_TotalDepth_pct,
                                   with_depth=True)

            # report
            print(datetime.now().strftime(time_format) +
                  'Data matrix exported to:')
            print(datetime.now().strftime(time_format) +
                  annotation_df_GeneNumber.split('/')[-1])
            print(datetime.now().strftime(time_format) +
                  annotation_df_GeneNumber_pct.split('/')[-1])
            if depth_file is not None:
                print(datetime.now().strftime(time_format) +
                      annotation_df_TotalDepth.split('/')[-1])
                print(datetime.now().strftime(time_format) +
                      annotation_df_TotalDepth_pct.split('/')[-1])

    print(datetime.now().strftime(time_format) + 'Done!')
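
The args dictionary unpacked at the top of dbCAN() would normally come from an argument parser that is not shown in this excerpt. A hypothetical invocation on a folder of protein FASTA files might look like the sketch below; the folder names, the db_dir contents and the value used for 'm' are placeholders, not documented options.

demo_args = {
    'i': 'faa_files',       # folder (or a single file) holding the input sequences
    'x': 'faa',             # extension used to glob files when 'i' is a folder
    'm': 'P',               # sequence type flag passed through to dbCAN_worker (assumed value)
    'depth': None,          # no depth files, so only the GeneNumber tables are produced
    'db_dir': 'dbCAN_db',   # folder with dbCAN-fam-HMMs.txt, hmmscan-parser.sh and CAZyDB.fam-activities.txt
    't': 4                  # size of the multiprocessing pool
}
dbCAN(demo_args)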
Esempio n. 10
0
def COG2014(args):

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_prot2003_2014 = '%s/prot2003-2014.fa' % DB_dir
    pwd_prot2003_2014_diamond = '%s/prot2003-2014.fa.dmnd' % DB_dir
    pwd_prot2003_2014_tab = '%s/prot2003-2014.tab' % DB_dir
    pwd_cog2003_2014 = '%s/cog2003-2014.csv' % DB_dir
    pwd_cognames2003_2014 = '%s/cognames2003-2014.tab' % DB_dir
    pwd_fun2003_2014 = '%s/fun2003-2014.tab' % DB_dir

    ############################################ check whether db files exist ##########################################

    # check whether db files exist
    unfound_inputs = []
    for each_input in [
            pwd_prot2003_2014, pwd_prot2003_2014_tab, pwd_cog2003_2014,
            pwd_cognames2003_2014, pwd_fun2003_2014
    ]:
        if (not os.path.isfile(each_input)) and (
                not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # check whether BLAST db index files exist
    if run_diamond is False:
        unfound_db_index_file = []
        for db_index in [
                'phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq'
        ]:
            pwd_db_index = '%s/prot2003-2014.fa.%s' % (DB_dir, db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)
        if len(unfound_db_index_file) > 0:
            print(datetime.now().strftime(time_format) +
                  'DB index files not found, running makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (
                pwd_prot2003_2014, pwd_prot2003_2014)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) +
                  'makeblastdb finished')

    if run_diamond is True:
        if os.path.isfile(pwd_prot2003_2014_diamond) is False:
            print(datetime.now().strftime(time_format) +
                  'DB file not found, making diamond db')
            diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (
                pwd_prot2003_2014, pwd_prot2003_2014_diamond)
            os.system(diamond_makedb_cmd)

    ################################################# read db into dict ################################################

    # get refseq_acc_to_protein_id_dict (prot2003-2014.tab)
    refseq_acc_to_protein_id_dict = {}
    for refseq_acc in open(pwd_prot2003_2014_tab):
        refseq_acc_split = refseq_acc.strip().split('\t')
        refseq_acc_to_protein_id_dict[
            refseq_acc_split[1]] = refseq_acc_split[0]

    # get protein_id_to_cog_id_dict (cog2003-2014.csv)
    protein_id_to_cog_id_dict = {}
    for protein_to_cog in open(pwd_cog2003_2014):
        protein_to_cog_split = protein_to_cog.strip().split(',')
        protein_id = protein_to_cog_split[2]
        cog_id = protein_to_cog_split[6]
        protein_id_to_cog_id_dict[protein_id] = cog_id

    # get cog_id_to_category_dict and cog_id_to_description_dict (cognames2003-2014.tab)
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    for cog_id_to_cate_des in open(pwd_cognames2003_2014,
                                   encoding='windows-1252'):
        if not cog_id_to_cate_des.startswith('#'):
            cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
            cog_id = cog_id_to_cate_des_split[0]
            cog_cate = cog_id_to_cate_des_split[1]
            cog_des = cog_id_to_cate_des_split[2]
            cog_id_to_category_dict[cog_id] = cog_cate
            cog_id_to_description_dict[cog_id] = cog_des

    # get cog_category_to_description_dict (fun2003-2014.tab)
    cog_category_list = []
    cog_category_to_description_dict = {}
    for cog_category in open(pwd_fun2003_2014):
        if not cog_category.startswith('#'):
            cog_category_split = cog_category.strip().split('\t')
            cog_category_list.append(cog_category_split[0])
            cog_category_to_description_dict[
                cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether the depth file exists
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) +
                      'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) +
              'Running COG annotation for 1 file with %s cores' %
              (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(
            file_in)

        COG2014_worker([
            file_in, pwd_prot2003_2014, refseq_acc_to_protein_id_dict,
            protein_id_to_cog_id_dict, cog_id_to_category_dict,
            cog_id_to_description_dict, cog_category_list,
            cog_category_to_description_dict, sequence_type, file_in_path,
            num_threads, run_diamond, evalue_cutoff, depth_file, pct_by_all
        ])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether the input folder exists
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) +
                  'input folder not found, program exited')
            exit()

        else:
            # check whether input genomes exist
            input_file_re = '%s/*.%s' % (file_in, file_extension)
            input_file_name_list = [
                os.path.basename(file_name)
                for file_name in glob.glob(input_file_re)
            ]

            if len(input_file_name_list) == 0:
                print(datetime.now().strftime(time_format) +
                      'input file not found, program exited')
                exit()

            # check whether depth files exist
            if depth_file is not None:

                if os.path.isfile(depth_file) is True:
                    print(
                        datetime.now().strftime(time_format) +
                        'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence files.'
                    )
                    print(
                        datetime.now().strftime(time_format) +
                        'a single file (not folder) detected, program exited!')
                    exit()

                if os.path.isdir(depth_file) is False:
                    print(datetime.now().strftime(time_format) +
                          'provided depth folder not found, program exited!')
                    exit()

                if os.path.isdir(depth_file) is True:

                    undetected_depth_file = []
                    for input_seq_file in input_file_name_list:
                        input_seq_file_basename = '.'.join(
                            input_seq_file.split('.')[:-1])
                        input_seq_file_depth = '%s/%s.depth' % (
                            depth_file, input_seq_file_basename)
                        if os.path.isfile(input_seq_file_depth) is False:
                            undetected_depth_file.append(input_seq_file_depth)

                    if len(undetected_depth_file) > 0:
                        print(
                            datetime.now().strftime(time_format) +
                            'the following depth files were not found, program exited!'
                        )
                        print(','.join(undetected_depth_file))
                        exit()

            ################################################### define file name ###################################################

            if '/' in file_in:
                file_in_folder_name = file_in.rstrip('/').split('/')[-1]
            else:
                file_in_folder_name = file_in

            output_folder = '%s_COG2014_wd' % file_in_folder_name

            # create output folder
            force_create_folder(output_folder)

            ######################################################### main #########################################################

            print(datetime.now().strftime(time_format) +
                  'Running COG annotation for %s files with %s cores' %
                  (len(input_file_name_list), num_threads))

            list_for_multiple_arguments_COG = []
            for input_file in input_file_name_list:

                input_file_basename = '.'.join(input_file.split('.')[:-1])
                pwd_input_file = '%s/%s' % (file_in, input_file)

                # get path to current depth file
                if depth_file is None:
                    input_file_depth = None
                else:
                    input_file_depth = '%s/%s.depth' % (depth_file,
                                                        input_file_basename)

                list_for_multiple_arguments_COG.append([
                    pwd_input_file, pwd_prot2003_2014,
                    refseq_acc_to_protein_id_dict, protein_id_to_cog_id_dict,
                    cog_id_to_category_dict, cog_id_to_description_dict,
                    cog_category_list, cog_category_to_description_dict,
                    sequence_type, output_folder, 1, run_diamond,
                    evalue_cutoff, input_file_depth, pct_by_all
                ])

            # run COG annotation with multiprocessing
            pool = mp.Pool(processes=num_threads)
            pool.map(COG2014_worker, list_for_multiple_arguments_COG)
            pool.close()
            pool.join()

            ######################################################### get dataframe #########################################################

            annotation_df_cog_cate_GeneNumber = '%s/%s_COG2014_cate_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct = '%s/%s_COG2014_cate_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_GeneNumber_pct_by_all = '%s/%s_COG2014_cate_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_cate_TotalDepth = '%s/%s_COG2014_cate_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct = '%s/%s_COG2014_cate_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_cate_TotalDepth_pct_by_all = '%s/%s_COG2014_cate_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_GeneNumber = '%s/%s_COG2014_id_GeneNumber.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct = '%s/%s_COG2014_id_GeneNumber_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_GeneNumber_pct_by_all = '%s/%s_COG2014_id_GeneNumber_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            annotation_df_cog_id_TotalDepth = '%s/%s_COG2014_id_TotalDepth.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct = '%s/%s_COG2014_id_TotalDepth_pct.txt' % (
                output_folder, file_in_folder_name)
            annotation_df_cog_id_TotalDepth_pct_by_all = '%s/%s_COG2014_id_TotalDepth_pct_by_all.txt' % (
                output_folder, file_in_folder_name)

            print(datetime.now().strftime(time_format) +
                  'Data matrix exported to:')

            # get df
            get_COG_annot_df(output_folder,
                             'cog_cate',
                             annotation_df_cog_cate_GeneNumber,
                             annotation_df_cog_cate_GeneNumber_pct,
                             annotation_df_cog_cate_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            get_COG_annot_df(output_folder,
                             'cog_id',
                             annotation_df_cog_id_GeneNumber,
                             annotation_df_cog_id_GeneNumber_pct,
                             annotation_df_cog_id_GeneNumber_pct_by_all,
                             with_depth=False,
                             pct_by_all=False)
            if pct_by_all is True:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_GeneNumber,
                                 annotation_df_cog_cate_GeneNumber_pct,
                                 annotation_df_cog_cate_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_GeneNumber,
                                 annotation_df_cog_id_GeneNumber_pct,
                                 annotation_df_cog_id_GeneNumber_pct_by_all,
                                 with_depth=False,
                                 pct_by_all=True)

            # report
            if pct_by_all is False:
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_id_GeneNumber.split('/')[-1],
                       annotation_df_cog_id_GeneNumber_pct.split('/')[-1]))
                print(datetime.now().strftime(time_format) + '%s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1]))
            else:
                print(
                    datetime.now().strftime(time_format) + '%s, %s and %s' %
                    (annotation_df_cog_id_GeneNumber.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct.split('/')[-1],
                     annotation_df_cog_id_GeneNumber_pct_by_all.split('/')[-1])
                )
                print(datetime.now().strftime(time_format) + '%s, %s and %s' %
                      (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct.split('/')[-1],
                       annotation_df_cog_cate_GeneNumber_pct_by_all.split('/')
                       [-1]))

            if depth_file is not None:
                get_COG_annot_df(output_folder,
                                 'cog_cate',
                                 annotation_df_cog_cate_TotalDepth,
                                 annotation_df_cog_cate_TotalDepth_pct,
                                 annotation_df_cog_cate_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                get_COG_annot_df(output_folder,
                                 'cog_id',
                                 annotation_df_cog_id_TotalDepth,
                                 annotation_df_cog_id_TotalDepth_pct,
                                 annotation_df_cog_id_TotalDepth_pct_by_all,
                                 with_depth=True,
                                 pct_by_all=False)
                if pct_by_all is True:
                    get_COG_annot_df(
                        output_folder,
                        'cog_cate',
                        annotation_df_cog_cate_TotalDepth,
                        annotation_df_cog_cate_TotalDepth_pct,
                        annotation_df_cog_cate_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)
                    get_COG_annot_df(
                        output_folder,
                        'cog_id',
                        annotation_df_cog_id_TotalDepth,
                        annotation_df_cog_id_TotalDepth_pct,
                        annotation_df_cog_id_TotalDepth_pct_by_all,
                        with_depth=True,
                        pct_by_all=True)

                # report
                if pct_by_all is False:
                    print(datetime.now().strftime(time_format) + '%s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1]))
                    print(
                        datetime.now().strftime(time_format) + '%s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1]))
                else:
                    print(datetime.now().strftime(time_format) +
                          '%s, %s and %s' %
                          (annotation_df_cog_id_TotalDepth.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct.split('/')[-1],
                           annotation_df_cog_id_TotalDepth_pct_by_all.split(
                               '/')[-1]))
                    print(
                        datetime.now().strftime(time_format) +
                        '%s, %s and %s' %
                        (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct.split('/')[-1],
                         annotation_df_cog_cate_TotalDepth_pct_by_all.split(
                             '/')[-1]))

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
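
COG2014_worker() and get_COG_annot_df() are not included in this excerpt. As a sketch only, the lookup tables built in COG2014() chain a BLAST/DIAMOND subject accession to a COG id, category and description roughly as shown below; chain_lookup_sketch() and its .get() fallbacks are illustrative and not part of the tool.

def chain_lookup_sketch(subject_acc, refseq_acc_to_protein_id_dict,
                        protein_id_to_cog_id_dict, cog_id_to_category_dict,
                        cog_id_to_description_dict):
    # RefSeq accession of the best hit -> protein (GI) id -> COG id -> category/description
    protein_id = refseq_acc_to_protein_id_dict.get(subject_acc)
    cog_id = protein_id_to_cog_id_dict.get(protein_id)
    cog_cate = cog_id_to_category_dict.get(cog_id, 'NA')
    cog_des = cog_id_to_description_dict.get(cog_id, 'NA')
    return cog_id, cog_cate, cog_des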