def GTDB_for_BLCA(args):
    """Reformat a GTDB SSU FASTA file into BLCA-style sequence/taxonomy files.

    Two files are written next to the input file:

    * ``<basename>_BLCAparsed.fasta``: every record, with only the record id
      as header.
    * ``<basename>_BLCAparsed.taxonomy``: one line per record holding the id,
      a tab, and ``rank:taxon;`` pairs ordered from species up to
      superkingdom.

    Args:
        args: dict; ``args['GTDB_ssu']`` is the path of the input FASTA file.

    Raises:
        IndexError: if a record description yields fewer than seven
            ';'-separated taxon fields.
    """
    GTDB_db_file = args['GTDB_ssu']

    GTDB_db_file_path, GTDB_db_file_basename, _ext = sep_path_basename_ext(GTDB_db_file)
    file_out_sequence = '%s/%s_BLCAparsed.fasta' % (GTDB_db_file_path, GTDB_db_file_basename)
    file_out_taxonomy = '%s/%s_BLCAparsed.taxonomy' % (GTDB_db_file_path, GTDB_db_file_basename)

    rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

    # Context managers guarantee both outputs are flushed and closed even if
    # a malformed record raises part-way through the input.
    with open(file_out_sequence, 'w') as file_out_sequence_handle, \
            open(file_out_taxonomy, 'w') as file_out_taxonomy_handle:

        for seq_record in SeqIO.parse(GTDB_db_file, 'fasta'):

            # Drop the leading id token and anything from ' [' onwards, then
            # split the remaining text into its ';'-separated taxon fields.
            description_core = seq_record.description.strip().split(' [')[0]
            taxon_fields = ' '.join(description_core.split(' ')[1:]).split(';')

            # Strip the three-character prefix of each field (e.g. 'd__')
            # and reverse so the last field lines up with 'species'.
            taxa_species_first = [field[3:] for field in taxon_fields][::-1]

            # Indexing (rather than zip) keeps the IndexError on short lineages.
            GTDB_seq_taxon_str = ''.join(
                '%s:%s;' % (taxon_rank, taxa_species_first[n])
                for n, taxon_rank in enumerate(rank_list))

            # write out to taxonomy file
            file_out_taxonomy_handle.write('%s\t%s\n' % (seq_record.id, GTDB_seq_taxon_str))

            # write out to sequence file
            file_out_sequence_handle.write('>%s\n' % seq_record.id)
            file_out_sequence_handle.write('%s\n' % seq_record.seq)
def reads2bam(args):
    """Map reads to a reference with bowtie2 and produce a sorted, indexed BAM.

    Builds shell commands for bowtie2 (optionally preceded by bowtie2-build)
    and samtools view/sort/index, then runs them in that order through
    ``os.system``. Output files are ``<p>.sam``, ``<p>.bam``,
    ``<p>_sorted.bam`` plus the samtools index; the first two are removed
    unless ``args['tmp']`` is set.

    Args:
        args: dict with keys ``'p'`` (output prefix), ``'ref'`` (reference
            FASTA), ``'index_ref'`` (run bowtie2-build when True), ``'r1'``,
            ``'r2'``, ``'u'`` (read files, None when absent), ``'fastq'``
            (True selects ``-q`` instead of ``-f``), ``'t'`` (threads) and
            ``'tmp'`` (False removes the intermediate SAM/BAM).

    The accepted read combinations are: r1 + r2, r1 + r2 + u, or u alone.
    Anything else prints a message and exits.
    """
    output_prefix = args['p']
    ref_seq = args['ref']
    index_ref = args['index_ref']
    r1_seq = args['r1']
    r2_seq = args['r2']
    unpaired_seq = args['u']
    fq_format = args['fastq']
    thread_num = args['t']
    keep_tmp = args['tmp']

    _ref_dir, ref_basename, _ref_ext = sep_path_basename_ext(ref_seq)

    have_both_mates = (r1_seq is not None) and (r2_seq is not None)
    have_no_mates = (r1_seq is None) and (r2_seq is None)
    have_unpaired = unpaired_seq is not None

    # Work out the read-input part of the bowtie2 command line.
    if have_both_mates and not have_unpaired:
        reads_opts = '-1 %s -2 %s' % (r1_seq, r2_seq)
    elif have_both_mates and have_unpaired:
        reads_opts = '-1 %s -2 %s -U %s' % (r1_seq, r2_seq, unpaired_seq)
    elif have_no_mates and have_unpaired:
        reads_opts = '-U %s' % unpaired_seq
    else:
        print('Please check your input reads files')
        exit()

    # -q: reads are FASTQ, -f: reads are FASTA
    format_flag = '-q' if fq_format is True else '-f'

    pipeline = []
    if index_ref is True:
        # The index is written under the reference basename, which is also
        # the name handed to bowtie2 -x below.
        pipeline.append('bowtie2-build -f %s %s --threads %s' % (ref_seq, ref_basename, thread_num))
    pipeline.append('bowtie2 -x %s %s -S %s.sam -p %s %s' % (
        ref_basename, reads_opts, output_prefix, thread_num, format_flag))
    pipeline.append('samtools view -bS %s.sam -o %s.bam' % (output_prefix, output_prefix))
    pipeline.append('samtools sort %s.bam -o %s_sorted.bam' % (output_prefix, output_prefix))
    pipeline.append('samtools index %s_sorted.bam' % output_prefix)

    if keep_tmp is False:
        # Intermediate files are only cleaned up on request.
        pipeline.append('rm %s.sam' % output_prefix)
        pipeline.append('rm %s.bam' % output_prefix)

    for shell_cmd in pipeline:
        os.system(shell_cmd)
def sam2bam(args):
    """Convert a SAM file into a sorted, indexed BAM file beside it.

    Runs ``samtools view``, ``samtools sort`` and ``samtools index`` through
    ``os.system`` and finally removes the unsorted BAM.

    Args:
        args: dict; ``args['sam']`` is the path of the input SAM file.
    """
    sam_in = args['sam']

    sam_dir, sam_stem, _sam_ext = sep_path_basename_ext(sam_in)
    unsorted_bam = '%s/%s.bam' % (sam_dir, sam_stem)
    sorted_bam = '%s/%s_sorted.bam' % (sam_dir, sam_stem)

    steps = (
        'samtools view -bS %s -o %s' % (sam_in, unsorted_bam),
        'samtools sort %s -o %s' % (unsorted_bam, sorted_bam),
        'samtools index %s' % sorted_bam,
        # the unsorted BAM is only an intermediate
        'rm %s' % unsorted_bam,
    )
    for shell_cmd in steps:
        os.system(shell_cmd)
def download_GenBank_genome(args):
    """Download the genomes listed in a CSV file using a process pool.

    Every line of the CSV that does not start with ``#Organism Name`` is
    split on ',' and handed, together with the download options, to
    ``genome_download_worker``. Files go into a freshly created folder named
    ``<csv basename>_genomes``.

    Args:
        args: dict with keys ``'csv'`` (input CSV path), ``'fna'``, ``'faa'``,
            ``'gbff'`` (booleans selecting the file types to fetch),
            ``'name'`` (forwarded to the worker unchanged) and ``'t'``
            (number of worker processes).

    Exits after printing a message when none of fna/faa/gbff is requested.
    """
    csv_file = args['csv']
    get_fna = args['fna']
    get_faa = args['faa']
    get_gbff = args['gbff']
    with_name = args['name']
    num_threads = args['t']

    time_format = '[%Y-%m-%d %H:%M:%S] '

    if (get_fna is False) and (get_faa is False) and (get_gbff is False):
        print(datetime.now().strftime(time_format) +
              'Please specify at least one file type to download, program exited')
        exit()

    _in_file_path, in_file_basename, _in_file_extension = sep_path_basename_ext(csv_file)
    downloaded_genome_folder = '%s_genomes' % in_file_basename
    force_create_folder(downloaded_genome_folder)

    # report
    print(datetime.now().strftime(time_format) + 'Downloading genomes with %s cores' % (num_threads))

    # Build one argument list per genome record; the header line is skipped.
    # Reading inside `with` ensures the CSV handle is closed afterwards.
    list_for_multiple_arguments_download_worker = []
    with open(csv_file) as csv_handle:
        for genome_record in csv_handle:
            if not genome_record.startswith('#Organism Name'):
                genome_record_split = genome_record.strip().split(',')
                list_for_multiple_arguments_download_worker.append([
                    genome_record_split, downloaded_genome_folder,
                    get_fna, get_faa, get_gbff, with_name
                ])

    # download genomes with multiprocessing; the pool is shut down even if a
    # worker raises so no child processes are left behind
    pool = mp.Pool(processes=num_threads)
    try:
        pool.map(genome_download_worker, list_for_multiple_arguments_download_worker)
    finally:
        pool.close()
        pool.join()
def run_blast_worker(argument_list):
    """Search one query file against the KEGG database and keep the best hits.

    A fresh ``<op_dir>/<basename>_KEGG_wd`` folder is always created. When
    blasting is enabled, either ``blastp`` or ``diamond blastp`` is run via
    ``os.system`` and the tabular result is reduced with
    ``keep_blast_hit_with_highest_bit_score``.

    Args:
        argument_list: positional bundle (suitable for ``Pool.map``):
            [0] query sequence file, [1] run_blast flag, [2] run_diamond
            flag, [3] blast database, [4] diamond database, [5] output
            directory, [6] e-value cutoff, [7] number of threads.
    """
    (query_file, do_blast, use_diamond, blast_db, diamond_db,
     out_root, max_evalue, n_threads) = [argument_list[pos] for pos in range(8)]

    ################################################### define file name ###################################################

    _query_dir, query_stem, _query_ext = sep_path_basename_ext(query_file)

    work_dir = '%s/%s_KEGG_wd' % (out_root, query_stem)
    raw_hits = '%s/%s_blast.tab' % (work_dir, query_stem)
    best_hits = '%s/%s_blast_best_hits.tab' % (work_dir, query_stem)

    # create output folder
    force_create_folder(work_dir)

    ########################################## blast against KEGG database (Shan) ##########################################

    if do_blast is not True:
        return

    if use_diamond is False:
        search_cmd = 'blastp -query %s -db %s -out %s -outfmt 6 -evalue %s -num_alignments 10 -num_threads %s' % (
            query_file, blast_db, raw_hits, max_evalue, n_threads)
    else:
        search_cmd = 'diamond blastp -q %s --db %s --out %s --outfmt 6 --evalue %s --block-size 1 --threads %s --quiet' % (
            query_file, diamond_db, raw_hits, max_evalue, n_threads)
    os.system(search_cmd)

    # only keep the best hit
    keep_blast_hit_with_highest_bit_score(raw_hits, best_hits)
def _dbCAN_family_activities(hmm_id, fam_to_activities_dict):
    """Return the activities recorded for a dbCAN HMM id, or 'NA' if unknown.

    Everything from the first '_' onwards is dropped before the lookup.
    """
    return fam_to_activities_dict.get(hmm_id.split('_')[0], 'NA')


def dbCAN_worker(argument_list):
    """Annotate one sequence file against dbCAN HMMs and summarise the hits.

    Runs hmmscan, the hmmscan parser script and an awk filter through
    ``os.system`` inside a fresh ``<output_folder>/<basename>_dbCAN_wd``
    folder, then writes:

    * ``<basename>_dbCAN.txt``: Query / Family / Activities per hit
    * ``<basename>_dbCAN_stats_GeneNumber.txt`` and ``..._GeneNumber_pct.txt``
    * ``<basename>_dbCAN_stats_TotalDepth.txt`` and ``..._TotalDepth_pct.txt``
      (only when a depth file is supplied)

    Args:
        argument_list: positional bundle (suitable for ``Pool.map``):
            [0] input sequence file, [1] hmmscan parser script (run with
            ``sh``), [2] dbCAN HMM database, [3] sequence type ('N'/'n' for
            nucleotide, which is translated with ``dna2aa``; 'P'/'p' for
            protein), [4] output folder, [5] dict mapping family id to
            activities, [6] depth file (tab-separated gene id and depth) or
            None.

    Raises:
        KeyError: if a depth file is given and a gene from the input file or
            from the hits is missing from it.

    Exits after printing a message when the sequence type is not recognised.
    """
    pwd_input_file = argument_list[0]
    pwd_hmmscan_parser = argument_list[1]
    pwd_dbCAN_fam_HMMs = argument_list[2]
    sequence_type = argument_list[3]
    output_folder = argument_list[4]
    fam_to_activities_dict = argument_list[5]
    depth_file = argument_list[6]

    _input_seq_path, input_seq_no_ext, _input_seq_ext = sep_path_basename_ext(pwd_input_file)
    current_output_folder = '%s/%s_dbCAN_wd' % (output_folder, input_seq_no_ext)

    force_create_folder(current_output_folder)

    if sequence_type in ('N', 'n'):
        # nucleotide input: translate into the working folder first
        input_seq_aa = '%s/%s_aa.fasta' % (current_output_folder, input_seq_no_ext)
        dna2aa(pwd_input_file, input_seq_aa)
    elif sequence_type in ('P', 'p'):
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    hmmscan_cmd = "hmmscan --domtblout %s/%s.out.dm %s %s > %s/%s.out" % (
        current_output_folder, input_seq_no_ext, pwd_dbCAN_fam_HMMs, input_seq_aa,
        current_output_folder, input_seq_no_ext)
    hmmscan_parser_cmd = "sh %s %s/%s.out.dm > %s/%s.out.dm.ps" % (
        pwd_hmmscan_parser, current_output_folder, input_seq_no_ext,
        current_output_folder, input_seq_no_ext)
    # NOTE(review): columns 5 and 10 of the parser output are presumably the
    # e-value and HMM coverage - confirm against the parser script.
    final_cat_cmd = "cat %s/%s.out.dm.ps | awk '$5<1e-18&&$10>0.35' > %s/%s.out.dm.ps.stringent" % (
        current_output_folder, input_seq_no_ext, current_output_folder, input_seq_no_ext)

    os.system(hmmscan_cmd)
    os.system(hmmscan_parser_cmd)
    os.system(final_cat_cmd)

    ################################### get functional descriptions for query genes ####################################

    pwd_stringent_hits = '%s/%s.out.dm.ps.stringent' % (current_output_folder, input_seq_no_ext)
    pwd_annotation_results = '%s/%s_dbCAN.txt' % (current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_GeneNumber = '%s/%s_dbCAN_stats_GeneNumber.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_GeneNumber_pct = '%s/%s_dbCAN_stats_GeneNumber_pct.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_TotalDepth = '%s/%s_dbCAN_stats_TotalDepth.txt' % (
        current_output_folder, input_seq_no_ext)
    pwd_annotation_results_stats_TotalDepth_pct = '%s/%s_dbCAN_stats_TotalDepth_pct.txt' % (
        current_output_folder, input_seq_no_ext)

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        with open(depth_file) as depth_handle:
            for each_depth in depth_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get all sequences in input seq file
    query_seq_list = [query_seq.id for query_seq in SeqIO.parse(pwd_input_file, 'fasta')]

    # The total itself is not used further down, but the lookups make a gene
    # that is missing from the depth file fail here rather than later.
    total_depth_for_all_query_genes = 0
    if depth_file is not None:
        for gene in query_seq_list:
            total_depth_for_all_query_genes += gene_depth_dict[gene]

    # parse hmmscan results
    hmm_to_gene_member_dict = {}
    with open(pwd_annotation_results, 'w') as pwd_annotation_results_handle:
        pwd_annotation_results_handle.write('Query\tFamily\tActivities\n')
        with open(pwd_stringent_hits) as stringent_handle:
            for hmm_hit in stringent_handle:
                hmm_hit_split = hmm_hit.strip().split('\t')
                query_id = hmm_hit_split[2]
                matched_hmm = hmm_hit_split[0]
                matched_hmm_id = matched_hmm.split('.hmm')[0]

                matched_hmm_activities = _dbCAN_family_activities(
                    matched_hmm_id, fam_to_activities_dict)

                # collect the genes hitting each HMM (duplicates are kept)
                hmm_to_gene_member_dict.setdefault(matched_hmm_id, []).append(query_id)

                pwd_annotation_results_handle.write(
                    '%s\t%s\t%s\n' % (query_id, matched_hmm, matched_hmm_activities))

    #################### get summary of annotation results GeneNumber ####################

    total_GeneNumber_identified = 0
    with open(pwd_annotation_results_stats_GeneNumber, 'w') as stats_GeneNumber_handle:
        stats_GeneNumber_handle.write('Family\tGeneNumber\tActivities\n')
        # keys already had any '.hmm' suffix removed when they were stored
        for each_hmm_id, gene_members in hmm_to_gene_member_dict.items():
            each_hmm_GeneNumber = len(gene_members)
            each_hmm_activities = _dbCAN_family_activities(each_hmm_id, fam_to_activities_dict)
            stats_GeneNumber_handle.write(
                '%s\t%s\t%s\n' % (each_hmm_id, each_hmm_GeneNumber, each_hmm_activities))
            total_GeneNumber_identified += each_hmm_GeneNumber

    #################### get summary of annotation results GeneNumber pct ####################

    AnnotateNorm(file_in=pwd_annotation_results_stats_GeneNumber,
                 skip_header=True,
                 value_column=2,
                 Divisor_value=total_GeneNumber_identified,
                 file_out=pwd_annotation_results_stats_GeneNumber_pct,
                 file_out_header='Family\tGeneNumber_pct\tActivities\n')

    #################### get summary of annotation results TotalDepth ####################

    if depth_file is not None:

        total_depth_identified = 0
        with open(pwd_annotation_results_stats_TotalDepth, 'w') as stats_TotalDepth_handle:
            stats_TotalDepth_handle.write('Family\tTotalDepth\tActivities\n')
            for each_hmm_id, gene_members in hmm_to_gene_member_dict.items():
                each_hmm_TotalDepth = 0
                for each_gene in gene_members:
                    each_hmm_TotalDepth += gene_depth_dict[each_gene]
                # keep two decimals, both in the file and in the running total
                each_hmm_TotalDepth = float("{0:.2f}".format(each_hmm_TotalDepth))

                each_hmm_activities = _dbCAN_family_activities(each_hmm_id, fam_to_activities_dict)
                stats_TotalDepth_handle.write(
                    '%s\t%s\t%s\n' % (each_hmm_id, each_hmm_TotalDepth, each_hmm_activities))
                total_depth_identified += each_hmm_TotalDepth

        #################### get summary of annotation results TotalDepth pct ####################

        AnnotateNorm(file_in=pwd_annotation_results_stats_TotalDepth,
                     skip_header=True,
                     value_column=2,
                     Divisor_value=total_depth_identified,
                     file_out=pwd_annotation_results_stats_TotalDepth_pct,
                     file_out_header='Family\tTotalDepth_pct\tActivities\n')
def _register_ko_member(ko_id, gene_id, ko_order, ko_to_genes, ko_to_gene_set):
    """Record gene_id under ko_id, keeping first-seen order and skipping repeats."""
    if ko_id not in ko_to_genes:
        ko_order.append(ko_id)
        ko_to_genes[ko_id] = [gene_id]
        ko_to_gene_set[ko_id] = {gene_id}
    elif gene_id not in ko_to_gene_set[ko_id]:
        ko_to_genes[ko_id].append(gene_id)
        ko_to_gene_set[ko_id].add(gene_id)


def parse_blast_op_worker(argument_list):
    """Turn best blast hits (or a user KO table) into KO assignments and stats.

    All outputs go to ``<op_dir>/<basename>_KEGG_wd/`` with the input
    basename as file prefix:

    * ``_KO_assignment_D.txt``: gene to level-D KO (only when blasting)
    * ``_ko_assignment_ABCD.txt``: gene to KO ids/descriptions at levels A-D
    * ``_ko_stats_<A|B|C|D>_GeneNumber.txt`` and ``..._GeneNumber_pct.txt``
    * ``_ko_stats_<A|B|C|D>_TotalDepth.txt`` and ``..._TotalDepth_pct.txt``
      (only with a depth file)
    * the matching ``..._pct_by_all.txt`` files when requested

    Args:
        argument_list: positional bundle (suitable for ``Pool.map``):
            [0] input file (sequences when blasting, otherwise a
            tab-separated gene/KO table), [1] run_blast flag, [2]-[5]
            description dicts for KO levels A, B, C and D, [6] dict mapping a
            level-D KO to a list of 'A|B|C|D' path strings, [7] dict mapping
            database sequence id to KO id(s) (comma separated), [8] output
            directory, [9] depth file (tab-separated gene id and depth) or
            None, [10] pct_by_all flag.

    Raises:
        KeyError: if a depth file is given and a gene is missing from it, or
            if a KO id has no entry in the matching description dict.
    """
    pwd_input_file = argument_list[0]
    run_blast = argument_list[1]
    As_description_dict = argument_list[2]
    Bs_description_dict = argument_list[3]
    Cs_description_dict = argument_list[4]
    Ds_description_dict = argument_list[5]
    D2ABCD_dict = argument_list[6]
    db_seq_to_KO_dict = argument_list[7]
    op_dir = argument_list[8]
    depth_file = argument_list[9]
    pct_by_all = argument_list[10]

    ################################################### define file name ###################################################

    _input_file_path, in_file_basename, _input_file_ext = sep_path_basename_ext(pwd_input_file)

    # every output shares '<op_dir>/<basename>_KEGG_wd/<basename>' as prefix
    wd_prefix = '%s/%s_KEGG_wd/%s' % (op_dir, in_file_basename, in_file_basename)

    blast_results_best_hit = '%s_blast_best_hits.tab' % wd_prefix
    KO_assignment_file_D = '%s_KO_assignment_D.txt' % wd_prefix
    KO_assignment_file_DCBA = '%s_ko_assignment_ABCD.txt' % wd_prefix

    def stats_file(level, kind):
        # e.g. stats_file('A', 'GeneNumber_pct')
        return '%s_ko_stats_%s_%s.txt' % (wd_prefix, level, kind)

    ko_levels = ('A', 'B', 'C', 'D')
    level_description = {'A': As_description_dict,
                         'B': Bs_description_dict,
                         'C': Cs_description_dict,
                         'D': Ds_description_dict}

    ################################################# parse blast results ##################################################

    if run_blast is True:

        # store blast results in dict
        query_to_db_seq_dict = {}
        with open(blast_results_best_hit) as best_hit_handle:
            for each_query in best_hit_handle:
                each_query_split = each_query.strip().split('\t')
                query_to_db_seq_dict[each_query_split[0]] = each_query_split[1]

        # get all query sequence id
        query_seq_id_list = [str(each_seq.id) for each_seq in SeqIO.parse(pwd_input_file, 'fasta')]

        # get ko id at level D for all query genes
        with open(KO_assignment_file_D, 'w') as KO_assignment_file_handle:
            for each_query_seq in sorted(query_seq_id_list):
                db_hit_id_ko = None
                if each_query_seq in query_to_db_seq_dict:
                    db_hit_id = query_to_db_seq_dict[each_query_seq]
                    if db_hit_id in db_seq_to_KO_dict:
                        db_hit_id_ko = db_seq_to_KO_dict[db_hit_id]

                if db_hit_id_ko is None:
                    # no hit, or hit without KO: the gene id alone
                    KO_assignment_file_handle.write('%s\n' % (each_query_seq))
                else:
                    # one line per KO when several are listed
                    for each_db_hit_id_ko in db_hit_id_ko.split(','):
                        KO_assignment_file_handle.write(
                            '%s\t%s\n' % (each_query_seq, each_db_hit_id_ko))
    else:
        # the user-supplied table already has the level-D layout
        KO_assignment_file_D = pwd_input_file

    # get ko id at all levels for all query genes
    query_seq_id_all = set()
    genes_with_ko = set()
    with open(KO_assignment_file_DCBA, 'w') as ko_assign_ABCD_handle, \
            open(KO_assignment_file_D) as KO_assignment_D_handle:
        ko_assign_ABCD_handle.write(
            'Gene_id\tko_A\tko_B\tko_C\tko_D\tDesc_A\tDesc_B\tDesc_C\tDesc_D\n')

        for query_gene in KO_assignment_D_handle:
            query_gene_split = query_gene.strip().split('\t')
            gene_ID = query_gene_split[0]

            if len(query_gene_split) == 1:
                query_seq_id_all.add(gene_ID)
                ko_assign_ABCD_handle.write('%s\n' % gene_ID)

            if len(query_gene_split) == 2:
                query_seq_id_all.add(gene_ID)
                genes_with_ko.add(gene_ID)
                KO_ID = query_gene_split[1]

                # A gene whose KO is absent from D2ABCD_dict gets no line in
                # the ABCD file but still counts as a gene with KO.
                if KO_ID in D2ABCD_dict:
                    for each_ABCD in D2ABCD_dict[KO_ID]:
                        each_KO_DCBA_list = each_ABCD.split('|')[::-1]
                        each_KO_DCBA_list_only_id = [i.split('_')[1] for i in each_KO_DCBA_list]
                        each_desc_A = As_description_dict[each_KO_DCBA_list_only_id[3]]
                        each_desc_B = Bs_description_dict[each_KO_DCBA_list_only_id[2]]
                        each_desc_C = Cs_description_dict[each_KO_DCBA_list_only_id[1]]
                        each_desc_D = Ds_description_dict[each_KO_DCBA_list_only_id[0]]
                        ko_assign_ABCD_handle.write(
                            '%s\t%s\t%s\t%s\t%s\t%s\n' % (
                                gene_ID, '\t'.join(each_KO_DCBA_list[::-1]),
                                each_desc_A, each_desc_B, each_desc_C, each_desc_D))

    ##################################################### Get summary ######################################################

    # read in depth info
    gene_depth_dict = {}
    if depth_file is not None:
        with open(depth_file) as depth_handle:
            for each_depth in depth_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get total number and depth of all genes in one file
    total_depth_for_all_query_genes = 0
    genes_with_ko_TotalDepth = 0
    if depth_file is not None:
        for gene in query_seq_id_all:
            total_depth_for_all_query_genes += gene_depth_dict[gene]
        genes_with_ko_TotalDepth = get_gene_list_TotalDepth(genes_with_ko, gene_depth_dict)

    # Per level: KO ids in first-seen order, KO -> ordered gene list, and a
    # parallel KO -> gene set so duplicate checks do not rescan the list.
    identified_ko = {level: [] for level in ko_levels}
    ko_to_gene_member = {level: {} for level in ko_levels}
    ko_to_gene_seen = {level: {} for level in ko_levels}

    with open(KO_assignment_file_DCBA) as ko_assign_ABCD_in:
        for each_query in ko_assign_ABCD_in:
            if each_query.startswith('Gene_id'):
                continue
            each_query_split = each_query.strip().split('\t')
            if len(each_query_split) == 1:
                # gene without KO assignment: nothing to tally
                continue
            query_id = each_query_split[0]
            # columns 1-4 hold the KO ids of levels A-D
            query_kos = [each_query_split[1], each_query_split[2],
                         each_query_split[3], each_query_split[4]]
            for level, query_ko in zip(ko_levels, query_kos):
                _register_ko_member(query_ko, query_id, identified_ko[level],
                                    ko_to_gene_member[level], ko_to_gene_seen[level])

    #################### write out GeneNumber and TotalDepth stats ####################

    for level in ko_levels:
        write_out_stats_GeneNumber(identified_ko[level], ko_to_gene_member[level],
                                   level_description[level], stats_file(level, 'GeneNumber'))

    if depth_file is not None:
        for level in ko_levels:
            write_out_stats_TotalDepth(identified_ko[level], ko_to_gene_member[level],
                                       gene_depth_dict, level_description[level],
                                       stats_file(level, 'TotalDepth'))

    #################### write out GeneNumber and TotalDepth stats (pct) ####################

    for level in ko_levels:
        AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(genes_with_ko),
                     stats_file(level, 'GeneNumber_pct'),
                     'KO\tGeneNumber_pct\tDescription\n')

    if depth_file is not None:
        for level in ko_levels:
            AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2, genes_with_ko_TotalDepth,
                         stats_file(level, 'TotalDepth_pct'),
                         'KO\tTotalDepth_pct\tDescription\n')

    #################### write out GeneNumber and TotalDepth stats (pct_by_all) ####################

    if pct_by_all is True:
        for level in ko_levels:
            AnnotateNorm(stats_file(level, 'GeneNumber'), True, 2, len(query_seq_id_all),
                         stats_file(level, 'GeneNumber_pct_by_all'),
                         'KO\tGeneNumber_pct_by_all\tDescription\n')

        if depth_file is not None:
            for level in ko_levels:
                AnnotateNorm(stats_file(level, 'TotalDepth'), True, 2,
                             total_depth_for_all_query_genes,
                             stats_file(level, 'TotalDepth_pct_by_all'),
                             'KO\tTotalDepth_pct_by_all\tDescription\n')
def Annotation_KEGG(args):
    """Run the KEGG annotation workflow for one input file or a folder of files.

    Keys read from ``args``:
        seq_in      sequence file/folder to be searched against the KEGG db
        ko_in       file/folder with existing annotation results (search skipped)
        x           extension of the input files (folder mode only)
        depth       depth file (file mode) or folder of ``<basename>.depth``
                    files (folder mode), or None
        pct_by_all  forwarded to the workers / data-frame step
        db_dir      folder holding kegg_db_seq.fasta, kegg_db_seq2ko.txt and
                    ko00001.keg
        diamond     True to search with diamond rather than blastp
        t           number of processes
        evalue      e-value cutoff forwarded to the search worker

    Exactly one of ``seq_in`` / ``ko_in`` must be given; otherwise a message is
    printed and the program exits.

    In folder mode the results are written to ``<input folder>_KEGG_wd`` and
    summarised into per-level (A/B/C/D) data matrices. A trailing ``/`` on the
    input folder is ignored so that the output folder and matrix names are
    derived from the folder name rather than from an empty string.
    """

    input_file_faa = args['seq_in']
    input_file_user_ko = args['ko_in']
    file_extension = args['x']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    KEGG_DB_folder = args['db_dir']
    run_diamond = args['diamond']
    num_threads = args['t']
    evalue_cutoff = args['evalue']

    # exactly one of the two input options has to be provided
    run_blast = None
    if (input_file_faa is not None) and (input_file_user_ko is None):
        run_blast = True
    elif (input_file_faa is None) and (input_file_user_ko is not None):
        run_blast = False
    else:
        print(datetime.now().strftime(time_format) + 'Please provide input file with either "-seq_in" or "-ko_in", do not provide both')
        exit()

    if run_blast is True:
        input_file_folder = input_file_faa
    else:
        input_file_folder = input_file_user_ko

    # check whether input file/folder exist
    if (os.path.isfile(input_file_folder) is False) and (os.path.isdir(input_file_folder) is False):
        print(datetime.now().strftime(time_format) + 'input file/folder not found, program exited')
        exit()

    # drop trailing slash(es) from a folder path: 'dir/' would otherwise give
    # an empty folder name and an output folder inside the input folder
    input_file_folder_no_slash = input_file_folder.rstrip('/')
    if os.path.isdir(input_file_folder) and (input_file_folder_no_slash != ''):
        input_file_folder = input_file_folder_no_slash

    if run_blast is True:
        print(datetime.now().strftime(time_format) + 'Input sequence file detected, will run blastp/diamond first')
        sleep(0.5)
    else:
        print(datetime.now().strftime(time_format) + 'Annotation results provided, blastp/diamond skipped')
        sleep(0.5)

    ################################################# define file name #################################################

    KEGG_DB_seq = '%s/kegg_db_seq.fasta' % KEGG_DB_folder
    KEGG_DB_seq_diamond = '%s/kegg_db_seq.fasta.dmnd' % KEGG_DB_folder
    KEGG_DB_seq2ko = '%s/kegg_db_seq2ko.txt' % KEGG_DB_folder
    KEGG_DB_ko = '%s/ko00001.keg' % KEGG_DB_folder

    ########################################## check whether diamond db exist ##########################################

    if (run_blast is True) and (run_diamond is True):
        if os.path.isfile(KEGG_DB_seq_diamond) is False:
            print(datetime.now().strftime(time_format) + 'DB file not found, making diamond db with %s' % KEGG_DB_seq)
            if os.path.isfile(KEGG_DB_seq) is True:
                diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (KEGG_DB_seq, KEGG_DB_seq_diamond)
                os.system(diamond_makedb_cmd)
            else:
                print(datetime.now().strftime(time_format) + '%s not found, program exited' % KEGG_DB_seq)
                exit()

    ########################################### check whether blast+ db exist ##########################################

    if (run_blast is True) and (run_diamond is False):
        unfound_db_index_file = []
        for db_index in ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']:
            pwd_db_index = '%s/kegg_db_seq.fasta.%s' % (KEGG_DB_folder, db_index)
            if not os.path.isfile(pwd_db_index):
                unfound_db_index_file.append(db_index)

        # (re)build the blast db as soon as any index file is missing
        if len(unfound_db_index_file) > 0:
            print(datetime.now().strftime(time_format) + 'blast db index not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (KEGG_DB_seq, KEGG_DB_seq)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) + 'makeblastdb finished')

    ######################################### Run blastp with multiprocessing ##########################################

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:
        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(input_file_folder)
        run_blast_worker([input_file_folder, run_blast, run_diamond, KEGG_DB_seq, KEGG_DB_seq_diamond,
                          input_file_path, evalue_cutoff, num_threads])

    if os.path.isdir(input_file_folder) is True:

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        force_create_folder(output_folder)

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]
        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) + 'input file not found, program exited')
            exit()

        # run blastp with multiprocessing
        if run_blast is True:
            print(datetime.now().strftime(time_format) + 'Running Blast/Diamond for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        # the worker receives run_blast itself (as in single-file mode), so it
        # is dispatched for every input file; one thread per file because the
        # parallelism comes from the pool
        list_for_multiple_arguments_blast = []
        for input_file in input_file_name_list:
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)
            list_for_multiple_arguments_blast.append([pwd_input_file, run_blast, run_diamond, KEGG_DB_seq,
                                                      KEGG_DB_seq_diamond, output_folder, evalue_cutoff, 1])

        # run blastp with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(run_blast_worker, list_for_multiple_arguments_blast)
        pool.close()
        pool.join()

    ############################################## Read in KEGG DB files ###############################################

    print(datetime.now().strftime(time_format) + 'Read in KEGG DB files')

    As_description_dict = {}
    Bs_description_dict = {}
    Cs_description_dict = {}
    Ds_description_dict = {}
    D2ABCD_dict = {}
    current_A = ''
    current_B = ''
    current_C = ''
    with open(KEGG_DB_ko) as KEGG_DB_ko_handle:
        for each_line in KEGG_DB_ko_handle:
            if each_line[0] in ['A', 'B', 'C', 'D']:
                # the id sits at a level-specific position of the
                # space-separated line (0 for A, 2 for B, 4 for C, 6 for D)
                each_line_split = each_line.strip().split(' ')

                if each_line[0] == 'A':
                    current_A_id = each_line_split[0]
                    current_A_description = ' '.join(each_line_split[1:])
                    current_A = current_A_id
                    As_description_dict[current_A_id] = current_A_description

                elif each_line[0] == 'B':
                    # lines holding only the level letter carry no entry
                    if len(each_line_split) > 1:
                        current_B_id = each_line_split[2]
                        current_B_description = ' '.join(each_line_split[3:])
                        current_B = current_B_id
                        Bs_description_dict[current_B_id] = current_B_description

                elif each_line[0] == 'C':
                    current_C_id = each_line_split[4]
                    current_C_description = ' '.join(each_line_split[5:])
                    current_C = current_C_id
                    Cs_description_dict[current_C_id] = current_C_description

                elif each_line[0] == 'D':
                    current_D_id = each_line_split[6]
                    current_D_description = ' '.join(each_line_split[7:])
                    Ds_description_dict[current_D_id] = current_D_description

                    # one D entry may sit under several A/B/C paths, keep each path once
                    ABCD_value = 'A_%s|B_%s|C_%s|D_%s' % (current_A, current_B, current_C, current_D_id)
                    if current_D_id not in D2ABCD_dict:
                        D2ABCD_dict[current_D_id] = [ABCD_value]
                    elif ABCD_value not in D2ABCD_dict[current_D_id]:
                        D2ABCD_dict[current_D_id].append(ABCD_value)

    # get db_seq_to_KO_dict
    db_seq_to_KO_dict = {}
    if run_blast is True:
        with open(KEGG_DB_seq2ko) as KEGG_DB_seq2ko_handle:
            for each_hit in KEGG_DB_seq2ko_handle:
                each_hit_split = each_hit.strip().split('\t')
                db_seq = each_hit_split[0]
                hit_id_KO = each_hit_split[1]
                if hit_id_KO != '':
                    db_seq_to_KO_dict[db_seq] = hit_id_KO

    ####################################################################################################################

    # check whether the input file is a file or folder
    if os.path.isfile(input_file_folder) is True:

        # check whether depth file exist
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) + 'Running KEGG annotation for 1 file with %s cores' % (num_threads))

        input_file_path, input_file_basename, input_file_ext = sep_path_basename_ext(input_file_folder)
        parse_blast_op_worker([input_file_folder, run_blast, As_description_dict, Bs_description_dict,
                               Cs_description_dict, Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict,
                               input_file_path, depth_file, pct_by_all])

    if os.path.isdir(input_file_folder) is True:

        input_file_re = '%s/*.%s' % (input_file_folder, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(datetime.now().strftime(time_format) + 'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                print(datetime.now().strftime(time_format) + 'single depth file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'specified depth folder not found, program exited!')
                exit()

            if os.path.isdir(depth_file) is True:
                # every input file needs a matching <basename>.depth file
                undetected_depth_file = []
                for input_seq_file in input_file_name_list:
                    input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                    input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                    if os.path.isfile(input_seq_file_depth) is False:
                        undetected_depth_file.append(input_seq_file_depth)

                if len(undetected_depth_file) > 0:
                    print(datetime.now().strftime(time_format) + 'the following depth files not found, program exited!')
                    print(','.join(undetected_depth_file))
                    exit()

        # create output folder
        output_folder = '%s_KEGG_wd' % input_file_folder
        input_folder_name = input_file_folder
        if '/' in input_file_folder:
            input_folder_name = input_file_folder.split('/')[-1]

        # parse blast results with multiprocessing
        if run_blast is True:
            print(datetime.now().strftime(time_format) + 'Parsing Blast/Diamond results for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_parse_blast_op = []
        for input_file in input_file_name_list:
            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (input_file_folder, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            list_for_multiple_arguments_parse_blast_op.append([pwd_input_file, run_blast, As_description_dict,
                                                               Bs_description_dict, Cs_description_dict,
                                                               Ds_description_dict, D2ABCD_dict, db_seq_to_KO_dict,
                                                               output_folder, input_file_depth, pct_by_all])

        # parse blast results with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(parse_blast_op_worker, list_for_multiple_arguments_parse_blast_op)
        pool.close()
        pool.join()

        ################################################### get dataframe ##################################################

        # only done in folder mode: output_folder / input_folder_name are not
        # defined when a single file was provided
        print(datetime.now().strftime(time_format) + 'Data matrix exported to:')

        for ko_level in ['A', 'B', 'C', 'D']:
            annotation_df_GeneNumber = '%s/%s_%s_GeneNumber.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct = '%s/%s_%s_GeneNumber_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_GeneNumber_pct_by_all = '%s/%s_%s_GeneNumber_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth = '%s/%s_%s_TotalDepth.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct = '%s/%s_%s_TotalDepth_pct.txt' % (output_folder, input_folder_name, ko_level)
            annotation_df_TotalDepth_pct_by_all = '%s/%s_%s_TotalDepth_pct_by_all.txt' % (output_folder, input_folder_name, ko_level)

            #################### get GeneNumber df and report ####################

            get_KEGG_annot_df(output_folder, ko_level, annotation_df_GeneNumber, annotation_df_GeneNumber_pct,
                              annotation_df_GeneNumber_pct_by_all, with_depth=False, pct_by_all=pct_by_all)

            print(annotation_df_GeneNumber.split('/')[-1])
            print(annotation_df_GeneNumber_pct.split('/')[-1])
            if pct_by_all is True:
                print(annotation_df_GeneNumber_pct_by_all.split('/')[-1])

            #################### get TotalDepth df and report ####################

            if depth_file is not None:
                get_KEGG_annot_df(output_folder, ko_level, annotation_df_TotalDepth, annotation_df_TotalDepth_pct,
                                  annotation_df_TotalDepth_pct_by_all, with_depth=True, pct_by_all=pct_by_all)

                print(annotation_df_TotalDepth.split('/')[-1])
                print(annotation_df_TotalDepth_pct.split('/')[-1])
                if pct_by_all is True:
                    print(annotation_df_TotalDepth_pct_by_all.split('/')[-1])

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
def get_ref_to_read_num_from_sam(input_sam_file, output_stats_file):
    """Count the alignment lines per reference sequence in a SAM file.

    Header lines (starting with '@') are ignored. For every other line the
    third tab-separated column is used as the reference id; lines with fewer
    than three columns (e.g. a blank line) are skipped.

    The counts are written to ``output_stats_file`` as ``ref_id<TAB>count``
    lines, sorted as plain strings. Sorting and writing are done in Python,
    so no temporary file or external ``sort``/``rm`` call is involved and
    paths containing spaces or shell metacharacters are handled correctly.

    :param input_sam_file: path to the SAM file to read
    :param output_stats_file: path of the stats file to write
    """

    # Store reads num in dict
    ref2read_num_dict = {}
    with open(input_sam_file) as input_sam_handle:
        for each_read in input_sam_handle:
            if each_read.startswith('@'):
                continue

            each_read_split = each_read.strip().split('\t')
            if len(each_read_split) < 3:
                continue

            ref_id = each_read_split[2]
            ref2read_num_dict[ref_id] = ref2read_num_dict.get(ref_id, 0) + 1

    # sort whole lines (as the former `sort` call did) and write them out
    stats_lines = sorted('%s\t%s\n' % (each_ref, read_num)
                         for each_ref, read_num in ref2read_num_dict.items())
    with open(output_stats_file, 'w') as output_stats_handle:
        output_stats_handle.writelines(stats_lines)
def get_bin_abundance(args):
    """Report the number and percentage of mapped reads per bin (or bin cluster).

    Keys read from ``args``:
        sam   SAM file with reads mapped to the contigs of the bins
        bin   folder holding the bin sequence files (fasta)
        x     suffix used to find bin files (pattern ``<bin>/*<x>``)
        o     output file
        g     cluster file (tab separated: cluster id, then member bin file
              names), or None
        Cdb   Cdb file to derive the cluster file from, or None

    ``g`` and ``Cdb`` are mutually exclusive; when neither is given every bin
    is reported on its own. The output table has the columns ``cluster``,
    ``read_num`` and ``read_pct``, where the percentage is relative to the
    reads mapped to any reference other than '*'. If no read is mapped at
    all, the percentage is reported as 0.0 instead of failing with a division
    by zero.
    """

    ################################################# read in arguments ################################################

    sam_file = args['sam']
    bin_folder = args['bin']
    bin_ext = args['x']
    output_file = args['o']
    cluster_info = args['g']
    dRep_Cdb_file = args['Cdb']

    ############################################## define bin_cluster file #############################################

    bin_cluster_file = ''
    if (cluster_info is None) and (dRep_Cdb_file is None):
        bin_cluster_file = None

    elif (cluster_info is not None) and (dRep_Cdb_file is None):
        bin_cluster_file = cluster_info

    elif (cluster_info is None) and (dRep_Cdb_file is not None):
        # derive a temporary cluster file; the timestamp keeps its name unique
        Cdb_file_path, Cdb_file_basename, Cdb_file_extension = sep_path_basename_ext(dRep_Cdb_file)
        cluster_file_from_Cdb = '%s/%s_derived_cluster_file_%s%s' % (Cdb_file_path,
                                                                     Cdb_file_basename,
                                                                     datetime.now().strftime('%Y-%m-%d_%Hh-%Mm-%Ss_%f'),
                                                                     Cdb_file_extension)
        Cdb_2_bin_cluster_file(dRep_Cdb_file, cluster_file_from_Cdb)
        bin_cluster_file = cluster_file_from_Cdb

    else:
        print(datetime.now().strftime(time_format) + 'cluster_info and dRep_Cdb are not compatible, please specify one only, program exited!')
        exit()

    ################################################ get bin to ctg dict ###############################################

    bin_file_re = '%s/*%s' % (bin_folder, bin_ext)
    bin_file_list = [os.path.basename(file_name) for file_name in glob.glob(bin_file_re)]
    if len(bin_file_list) == 0:
        print(datetime.now().strftime(time_format) + 'No bin file found, program exited!')
        exit()

    bin_2_ctg_dict = {}
    for each_bin in bin_file_list:
        pwd_each_bin = '%s/%s' % (bin_folder, each_bin)
        bin_2_ctg_dict[each_bin] = {seq.id for seq in SeqIO.parse(pwd_each_bin, 'fasta')}

    ############################################ get group to ctg list dict ############################################

    print(datetime.now().strftime(time_format) + 'Get bin (cluster) to contig correlations')

    if bin_cluster_file is None:
        group_2_ctg_dict = bin_2_ctg_dict
    else:
        # get group to bin dict
        group_2_bin_dict = {}
        with open(bin_cluster_file) as bin_cluster_file_handle:
            for group in bin_cluster_file_handle:
                group_split = group.strip().split('\t')
                group_2_bin_dict[group_split[0]] = group_split[1:]

        # a group owns the contigs of all its member bins
        group_2_ctg_dict = {}
        for bin_group, group_member_list in group_2_bin_dict.items():
            group_ctg_set = set()
            for genome_bin in group_member_list:
                group_ctg_set.update(bin_2_ctg_dict[genome_bin])
            group_2_ctg_dict[bin_group] = group_ctg_set

    ########################################### get_ref_to_read_num_from_sam ###########################################

    print(datetime.now().strftime(time_format) + 'Get the number of reads mapped to each reference sequence in sam file')

    sam_file_path, sam_file_basename, sam_file_extension = sep_path_basename_ext(sam_file)
    ref_to_read_num_file = '%s/%s_ref_to_read_num_%s.txt' % (sam_file_path,
                                                             sam_file_basename,
                                                             datetime.now().strftime('%Y-%m-%d_%Hh-%Mm-%Ss_%f'))
    get_ref_to_read_num_from_sam(sam_file, ref_to_read_num_file)

    ########################################### read in ref_to_read_num_file ###########################################

    mapped_reads_num = 0
    ref_to_read_num_dict = {}
    with open(ref_to_read_num_file) as ref_to_read_num_file_handle:
        for each_ctg in ref_to_read_num_file_handle:
            each_ctg_split = each_ctg.strip().split('\t')
            ctg_id = each_ctg_split[0]
            read_num = int(each_ctg_split[1])
            ref_to_read_num_dict[ctg_id] = read_num

            # '*' collects the reads without a reference, i.e. unmapped ones
            if ctg_id != '*':
                mapped_reads_num += read_num

    ###################################### get the number of reads in each group #######################################

    print(datetime.now().strftime(time_format) + 'Get the number of reads mapped to each bin (cluster)')

    group_to_read_num_dict = {}
    for group, group_ctg_list in group_2_ctg_dict.items():
        group_to_read_num_dict[group] = sum(ref_to_read_num_dict.get(ctg, 0) for ctg in group_ctg_list)

    with open(output_file, 'w') as output_file_handle:
        output_file_handle.write('cluster\tread_num\tread_pct\n')
        for group, group_read_num in group_to_read_num_dict.items():
            if mapped_reads_num == 0:
                # nothing mapped at all: avoid ZeroDivisionError
                group_read_pct = 0.0
            else:
                group_read_pct = float("{0:.2f}".format(group_read_num * 100 / mapped_reads_num))
            output_file_handle.write('%s\t%s\t%s\n' % (group, group_read_num, group_read_pct))

    ################################################## final report ####################################################

    # delete tmp files (without going through the shell, so that unusual
    # characters in the paths cannot break or alter the command)
    if os.path.isfile(ref_to_read_num_file):
        os.remove(ref_to_read_num_file)

    if (cluster_info is None) and (dRep_Cdb_file is not None):
        if os.path.isfile(bin_cluster_file):
            os.remove(bin_cluster_file)

    # final report
    print(datetime.now().strftime(time_format) + 'Done!')
def COG2020(args):
    """Run the COG (2020 release) annotation for one input file or a folder of files.

    Keys read from ``args``:
        i           input sequence file or folder
        x           extension of the input files (folder mode only)
        m           sequence type, forwarded to the worker
        depth       depth file (file mode) or folder of ``<basename>.depth``
                    files (folder mode), or None
        pct_by_all  True to additionally produce the ``*_pct_by_all`` matrices
        db_dir      folder holding cog-20.fa, cog-20.cog.csv, cog-20.def.tab
                    and fun-20.tab (plus cog-20.fa.dmnd for diamond)
        t           number of processes
        diamond     True to search with diamond
        evalue      e-value cutoff forwarded to the worker

    All db files that are read here, including cog-20.cog.csv, are checked up
    front; missing ones are listed and the program exits.

    In folder mode the results go to ``<folder name>_COG2020_wd`` in the
    current working directory. A trailing ``/`` on the input folder is ignored
    when deriving the folder name.
    """

    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_cog_20_fa = '%s/cog-20.fa' % DB_dir
    pwd_cog_20_fa_diamond = '%s/cog-20.fa.dmnd' % DB_dir
    pwd_cog_20_cog_csv = '%s/cog-20.cog.csv' % DB_dir
    pwd_cog_20_def_tab = '%s/cog-20.def.tab' % DB_dir
    pwd_fun_20_tab = '%s/fun-20.tab' % DB_dir

    ############################################ check whether db file exist ###########################################

    # check whether db file exist (cog-20.cog.csv is read below as well, so it
    # is part of the check)
    unfound_inputs = []
    for each_input in [pwd_cog_20_fa, pwd_cog_20_cog_csv, pwd_cog_20_def_tab, pwd_fun_20_tab]:
        if (not os.path.isfile(each_input)) and (not os.path.isdir(each_input)):
            unfound_inputs.append(each_input)

    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    if run_diamond is True:
        if os.path.isfile(pwd_cog_20_fa_diamond) is False:
            print(datetime.now().strftime(time_format) + 'DB file for diamond not found, please refers to the help info for diamond db preparation')
            print(datetime.now().strftime(time_format) + 'Program exited!')
            exit()

    ################################################# read db into dict ################################################

    # get protein_to_cog_dict (cog-20.cog.csv)
    protein_to_cog_dict = {}
    with open(pwd_cog_20_cog_csv) as cog_20_cog_csv_handle:
        for each_line in cog_20_cog_csv_handle:
            each_line_split = each_line.strip().split(',')
            protein_id = each_line_split[2]
            # dots in the protein id are replaced by underscores
            protein_id_no_dot = '_'.join(protein_id.split('.'))
            cog_id = each_line_split[6]
            protein_to_cog_dict.setdefault(protein_id_no_dot, set()).add(cog_id)

    # get cog_id_to_category_dict and cog_id_to_description_dict (cog-20.def.tab)
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    with open(pwd_cog_20_def_tab, encoding='windows-1252') as cog_20_def_tab_handle:
        for cog_id_to_cate_des in cog_20_def_tab_handle:
            if not cog_id_to_cate_des.startswith('#'):
                cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
                cog_id = cog_id_to_cate_des_split[0]
                cog_cate = cog_id_to_cate_des_split[1]
                cog_des = cog_id_to_cate_des_split[2]
                cog_id_to_category_dict[cog_id] = cog_cate
                cog_id_to_description_dict[cog_id] = cog_des

    # get cog_category_to_description_dict (fun-20.tab)
    cog_category_list = []
    cog_category_to_description_dict = {}
    with open(pwd_fun_20_tab) as fun_20_tab_handle:
        for cog_category in fun_20_tab_handle:
            if not cog_category.startswith('#'):
                cog_category_split = cog_category.strip().split('\t')
                cog_category_list.append(cog_category_split[0])
                cog_category_to_description_dict[cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    # if input is file
    if os.path.isfile(file_in) is True:

        # check whether depth file exist
        if depth_file is not None:
            if os.path.isfile(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'specified depth file not found, program exited!')
                exit()

        print(datetime.now().strftime(time_format) + 'Running COG annotation for 1 file with %s cores' % (num_threads))

        file_in_path, file_in_basename, file_in_ext = sep_path_basename_ext(file_in)
        COG2020_worker([file_in, pwd_cog_20_fa, protein_to_cog_dict, cog_id_to_category_dict,
                        cog_id_to_description_dict, cog_category_list, cog_category_to_description_dict,
                        sequence_type, file_in_path, num_threads, run_diamond, evalue_cutoff, depth_file,
                        pct_by_all])

    ################################################ if input is folder ################################################

    # if input is folder
    else:

        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) + 'input folder not found, program exited')
            exit()

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (file_in, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]
        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) + 'input file not found, program exited')
            exit()

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(datetime.now().strftime(time_format) + 'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                print(datetime.now().strftime(time_format) + 'a single file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'provided depth folder not found, program exited!')
                exit()

            if os.path.isdir(depth_file) is True:
                # every input file needs a matching <basename>.depth file
                undetected_depth_file = []
                for input_seq_file in input_file_name_list:
                    input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                    input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                    if os.path.isfile(input_seq_file_depth) is False:
                        undetected_depth_file.append(input_seq_file_depth)

                if len(undetected_depth_file) > 0:
                    print(datetime.now().strftime(time_format) + 'the following depth files not found, program exited!')
                    print(','.join(undetected_depth_file))
                    exit()

        ################################################# define file name #############################################

        # last path component; trailing slashes are dropped first so that
        # 'dir/' gives 'dir' rather than an empty name
        file_in_folder_name = file_in.rstrip('/').split('/')[-1]

        output_folder = '%s_COG2020_wd' % file_in_folder_name

        # create output folder
        force_create_folder(output_folder)

        ####################################################### main ###################################################

        print(datetime.now().strftime(time_format) + 'Running COG annotation for %s files with %s cores' % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_COG = []
        for input_file in input_file_name_list:
            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (file_in, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            # one thread per file, the parallelism comes from the pool
            list_for_multiple_arguments_COG.append([pwd_input_file, pwd_cog_20_fa, protein_to_cog_dict,
                                                    cog_id_to_category_dict, cog_id_to_description_dict,
                                                    cog_category_list, cog_category_to_description_dict,
                                                    sequence_type, output_folder, 1, run_diamond,
                                                    evalue_cutoff, input_file_depth, pct_by_all])

        # run COG annotaion files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(COG2020_worker, list_for_multiple_arguments_COG)
        pool.close()
        pool.join()

        ################################################### get dataframe ##############################################

        annotation_df_cog_cate_GeneNumber = '%s/%s_COG2020_cate_GeneNumber.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_cate_GeneNumber_pct = '%s/%s_COG2020_cate_GeneNumber_pct.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_cate_GeneNumber_pct_by_all = '%s/%s_COG2020_cate_GeneNumber_pct_by_all.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_cate_TotalDepth = '%s/%s_COG2020_cate_TotalDepth.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_cate_TotalDepth_pct = '%s/%s_COG2020_cate_TotalDepth_pct.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_cate_TotalDepth_pct_by_all = '%s/%s_COG2020_cate_TotalDepth_pct_by_all.txt' % (output_folder, file_in_folder_name)

        annotation_df_cog_id_GeneNumber = '%s/%s_COG2020_id_GeneNumber.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_id_GeneNumber_pct = '%s/%s_COG2020_id_GeneNumber_pct.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_id_GeneNumber_pct_by_all = '%s/%s_COG2020_id_GeneNumber_pct_by_all.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_id_TotalDepth = '%s/%s_COG2020_id_TotalDepth.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_id_TotalDepth_pct = '%s/%s_COG2020_id_TotalDepth_pct.txt' % (output_folder, file_in_folder_name)
        annotation_df_cog_id_TotalDepth_pct_by_all = '%s/%s_COG2020_id_TotalDepth_pct_by_all.txt' % (output_folder, file_in_folder_name)

        print(datetime.now().strftime(time_format) + 'Data matrix exported to:')

        # get df
        get_COG_annot_df(output_folder, 'cog_cate', annotation_df_cog_cate_GeneNumber,
                         annotation_df_cog_cate_GeneNumber_pct, annotation_df_cog_cate_GeneNumber_pct_by_all,
                         with_depth=False, pct_by_all=False)
        get_COG_annot_df(output_folder, 'cog_id', annotation_df_cog_id_GeneNumber,
                         annotation_df_cog_id_GeneNumber_pct, annotation_df_cog_id_GeneNumber_pct_by_all,
                         with_depth=False, pct_by_all=False)
        if pct_by_all is True:
            get_COG_annot_df(output_folder, 'cog_cate', annotation_df_cog_cate_GeneNumber,
                             annotation_df_cog_cate_GeneNumber_pct, annotation_df_cog_cate_GeneNumber_pct_by_all,
                             with_depth=False, pct_by_all=True)
            get_COG_annot_df(output_folder, 'cog_id', annotation_df_cog_id_GeneNumber,
                             annotation_df_cog_id_GeneNumber_pct, annotation_df_cog_id_GeneNumber_pct_by_all,
                             with_depth=False, pct_by_all=True)

        # report
        if pct_by_all is False:
            print(datetime.now().strftime(time_format) + '%s and %s' % (annotation_df_cog_id_GeneNumber.split('/')[-1],
                                                                        annotation_df_cog_id_GeneNumber_pct.split('/')[-1]))
            print(datetime.now().strftime(time_format) + '%s and %s' % (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                                                                        annotation_df_cog_cate_GeneNumber_pct.split('/')[-1]))
        else:
            print(datetime.now().strftime(time_format) + '%s, %s and %s' % (annotation_df_cog_id_GeneNumber.split('/')[-1],
                                                                            annotation_df_cog_id_GeneNumber_pct.split('/')[-1],
                                                                            annotation_df_cog_id_GeneNumber_pct_by_all.split('/')[-1]))
            print(datetime.now().strftime(time_format) + '%s, %s and %s' % (annotation_df_cog_cate_GeneNumber.split('/')[-1],
                                                                            annotation_df_cog_cate_GeneNumber_pct.split('/')[-1],
                                                                            annotation_df_cog_cate_GeneNumber_pct_by_all.split('/')[-1]))

        if depth_file is not None:
            get_COG_annot_df(output_folder, 'cog_cate', annotation_df_cog_cate_TotalDepth,
                             annotation_df_cog_cate_TotalDepth_pct, annotation_df_cog_cate_TotalDepth_pct_by_all,
                             with_depth=True, pct_by_all=False)
            get_COG_annot_df(output_folder, 'cog_id', annotation_df_cog_id_TotalDepth,
                             annotation_df_cog_id_TotalDepth_pct, annotation_df_cog_id_TotalDepth_pct_by_all,
                             with_depth=True, pct_by_all=False)
            if pct_by_all is True:
                get_COG_annot_df(output_folder, 'cog_cate', annotation_df_cog_cate_TotalDepth,
                                 annotation_df_cog_cate_TotalDepth_pct, annotation_df_cog_cate_TotalDepth_pct_by_all,
                                 with_depth=True, pct_by_all=True)
                get_COG_annot_df(output_folder, 'cog_id', annotation_df_cog_id_TotalDepth,
                                 annotation_df_cog_id_TotalDepth_pct, annotation_df_cog_id_TotalDepth_pct_by_all,
                                 with_depth=True, pct_by_all=True)

            # report
            if pct_by_all is False:
                print(datetime.now().strftime(time_format) + '%s and %s' % (annotation_df_cog_id_TotalDepth.split('/')[-1],
                                                                            annotation_df_cog_id_TotalDepth_pct.split('/')[-1]))
                print(datetime.now().strftime(time_format) + '%s and %s' % (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                                                                            annotation_df_cog_cate_TotalDepth_pct.split('/')[-1]))
            else:
                print(datetime.now().strftime(time_format) + '%s, %s and %s' % (annotation_df_cog_id_TotalDepth.split('/')[-1],
                                                                                annotation_df_cog_id_TotalDepth_pct.split('/')[-1],
                                                                                annotation_df_cog_id_TotalDepth_pct_by_all.split('/')[-1]))
                print(datetime.now().strftime(time_format) + '%s, %s and %s' % (annotation_df_cog_cate_TotalDepth.split('/')[-1],
                                                                                annotation_df_cog_cate_TotalDepth_pct.split('/')[-1],
                                                                                annotation_df_cog_cate_TotalDepth_pct_by_all.split('/')[-1]))

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
def SILVA_for_BLCA(args):
    """Convert a SILVA SSU fasta file into the sequence/taxonomy files used by BLCA.

    Reads the fasta file given by ``args['SILVA_ssu']`` and writes, next to
    it, ``<basename>_BLCAparsed.fasta`` and ``<basename>_BLCAparsed.taxonomy``.

    For every record the taxonomy is the part of the description after the
    first space, split on ';'. Records whose taxonomy starts with 'Eukaryota'
    are skipped. The taxonomy is brought to exactly seven levels:

    * fewer than seven levels: padded at the lower-rank end with
      'unidentified' (this covers any number of missing levels, including
      records with a single level)
    * more than seven levels: everything from the seventh level on is joined
      with spaces into a single (species) entry

    Each taxonomy line is ``<seq id><TAB>species:..;genus:..; ... superkingdom:..;``.
    """

    SILVA_db_file = args['SILVA_ssu']

    SILVA_db_file_path, SILVA_db_file_basename, SILVA_db_file_ext = sep_path_basename_ext(SILVA_db_file)
    file_out_sequence = '%s/%s_BLCAparsed.fasta' % (SILVA_db_file_path, SILVA_db_file_basename)
    file_out_taxonomy = '%s/%s_BLCAparsed.taxonomy' % (SILVA_db_file_path, SILVA_db_file_basename)

    # ranks from lowest to highest, i.e. the reverse of the order in the header
    rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
    rank_num = len(rank_list)

    with open(file_out_sequence, 'w') as file_out_sequence_handle, \
            open(file_out_taxonomy, 'w') as file_out_taxonomy_handle:

        for SILVA_seq in SeqIO.parse(SILVA_db_file, 'fasta'):
            SILVA_seq_taxon = ' '.join(SILVA_seq.description.split(' ')[1:])
            if SILVA_seq_taxon.startswith('Eukaryota'):
                continue

            SILVA_seq_taxon_split = SILVA_seq_taxon.split(';')

            if len(SILVA_seq_taxon_split) < rank_num:
                # pad the missing lower ranks
                missing_rank_num = rank_num - len(SILVA_seq_taxon_split)
                SILVA_seq_taxon_split.extend(['unidentified'] * missing_rank_num)
            elif len(SILVA_seq_taxon_split) > rank_num:
                # merge the surplus levels into the last (species) entry
                SILVA_seq_taxon_split = SILVA_seq_taxon_split[:rank_num - 1] + [' '.join(SILVA_seq_taxon_split[rank_num - 1:])]

            SILVA_seq_taxon_split_reverse = SILVA_seq_taxon_split[::-1]
            SILVA_seq_taxon_str = ''.join('%s:%s;' % (taxon_rank, taxon)
                                          for taxon_rank, taxon in zip(rank_list, SILVA_seq_taxon_split_reverse))

            # write out to taxonomy file
            file_out_taxonomy_handle.write('%s\t%s\n' % (SILVA_seq.id, SILVA_seq_taxon_str))

            # write out to sequence file
            file_out_sequence_handle.write('>%s\n' % SILVA_seq.id)
            file_out_sequence_handle.write('%s\n' % SILVA_seq.seq)
def get_gene_depth(args):
    """Assign to every annotated gene the depth of the contig it sits on.

    Args:
        args: mapping of command line options. Keys read:
            ``gbk`` / ``gff``  - annotation file (at least one is required;
                                 the gff file wins when both are given),
            ``ctg_depth``      - tab-separated contig depth table,
            ``id_column``      - 1-based column holding the contig id,
            ``depth_column``   - 1-based column holding the depth value,
            ``skip_header``    - ``True`` to ignore the first line of the table.

    The result is a two column table (gene id, depth) written to
    ``<path>/<basename>.depth`` of the annotation file, or to
    ``<path>/<basename>.depth.txt`` when the former already exists.
    A contig id missing from the depth table raises ``KeyError``.
    """
    gbk_file = args['gbk']
    gff_file = args['gff']
    ctg_depth_file = args['ctg_depth']
    id_column = args['id_column']
    depth_column = args['depth_column']
    skip_depth_file_header = args['skip_header']

    ################################################# check input file #################################################

    if (gbk_file is None) and (gff_file is None):
        print(datetime.now().strftime(time_format) + 'Please provide either a gbk file or a gff file, program exited!')
        exit()

    if (gbk_file is not None) and (gff_file is not None):
        print(datetime.now().strftime(time_format) + 'Both gbk and gff file were provided, will parse annotation results from gff file')

    # the gff file takes priority; decide once so exactly one parser runs below
    use_gff = gff_file is not None
    annotation_file = gff_file if use_gff else gbk_file

    # define output gene depth file name
    annotation_file_path, annotation_file_basename, _annotation_file_ext = sep_path_basename_ext(annotation_file)
    pwd_gene_depth_file = '%s/%s.depth' % (annotation_file_path, annotation_file_basename)
    if os.path.isfile(pwd_gene_depth_file) is True:
        # do not overwrite an existing .depth file, fall back to an alternative name
        pwd_gene_depth_file = '%s/%s.depth.txt' % (annotation_file_path, annotation_file_basename)

    ################################################ read in ctg depth #################################################

    ctg_depth_dict = {}
    with open(ctg_depth_file) as ctg_depth_handle:
        for line_index, ctg in enumerate(ctg_depth_handle):
            if (skip_depth_file_header is True) and (line_index == 0):
                continue
            if not ctg.strip():
                # tolerate blank (typically trailing) lines instead of failing on them
                continue
            ctg_split = ctg.strip().split('\t')
            ctg_depth_dict[ctg_split[id_column - 1]] = float(ctg_split[depth_column - 1])

    ################################################## get gene depth ##################################################

    with open(pwd_gene_depth_file, 'w') as gene_depth_file_handle:
        if use_gff:
            with open(gff_file) as gff_handle:
                for each_line in gff_handle:
                    if each_line.startswith('#'):
                        continue
                    each_line_split = each_line.strip().split('\t')
                    # lines without tab-separated columns carry no feature, skip them
                    if len(each_line_split) > 1:
                        seq_depth = ctg_depth_dict[each_line_split[0]]
                        # gene id is the value of the leading ID= attribute in column 9
                        gene_id = each_line_split[8].split(';')[0].split('ID=')[-1]
                        gene_depth_file_handle.write('%s\t%s\n' % (gene_id, seq_depth))
        else:
            for seq_record in SeqIO.parse(gbk_file, 'genbank'):
                seq_depth = ctg_depth_dict[seq_record.id]
                for feature in seq_record.features:
                    if feature.type not in ('source', 'assembly_gap'):
                        gene_id = feature.qualifiers['locus_tag'][0]
                        gene_depth_file_handle.write('%s\t%s\n' % (gene_id, seq_depth))

    ###################################################### report ######################################################

    print(datetime.now().strftime(time_format) + 'Gene depth exported to %s' % pwd_gene_depth_file)
    print(datetime.now().strftime(time_format) + 'Done!')
def NetEnzymes(args, config_dict):
    """Build a reaction network for a list of EC numbers and export it as GraphML.

    Args:
        args: mapping of command line options. Keys read:
            ``enzymes``   - file with one EC number per line,
            ``compounds`` - optional file listing the compounds to include
                            (all compounds are included when ``None``),
            ``ko``        - optional KO category id; when given, only ECs
                            reported for that category are used,
            ``NoHyphen``  - ``True`` to drop EC numbers containing ``-``,
            ``to_skip``   - optional file with entries passed to
                            ``parse_biological_raction`` as its skip list,
            ``plot``      - ``True`` to also render the network as a PNG,
            ``lfs``       - label font size for the plot,
            ``ns``        - node size for the plot.
        config_dict: mapping providing ``ko00001_keg`` and
            ``MetaCyc_rxns_with_ec`` (paths to the two database files).

    Outputs are named from the path and basename components of the enzyme
    list file, with an optional ``_ko<id>`` and/or ``_NoHyphen`` tag, and
    the extensions ``.graphml`` and (when plotting) ``.png``.
    """
    enzymes_list_file = args['enzymes']
    compound_list_file = args['compounds']
    interested_ko_id = args['ko']
    ignore_ec_with_hyphen = args['NoHyphen']
    to_skip_file = args['to_skip']
    plot_network = args['plot']
    label_font_size = args['lfs']
    node_size = args['ns']
    ko00001_keg = config_dict['ko00001_keg']
    db_file_with_ec = config_dict['MetaCyc_rxns_with_ec']

    ####################################################################################################################

    node_color_dict = {'enzyme': 'lightgreen', 'substrate': 'grey', 'product': 'grey'}

    skip_list = set()
    if to_skip_file is not None:
        with open(to_skip_file) as to_skip_handle:
            skip_list = {each_to_skip.strip() for each_to_skip in to_skip_handle}

    # the string 'all' is the marker for "no compound filter"
    compounds_to_include_list = 'all'
    if compound_list_file is not None:
        with open(compound_list_file) as compound_handle:
            compounds_to_include_list = {compound.strip() for compound in compound_handle}

    # define output file name: <path>/<basename>[_ko<id>][_NoHyphen].<ext>
    ec_file_no_path, ec_file_no_ext, _ec_file_ext = sep_path_basename_ext(enzymes_list_file)
    output_tag = ''
    if interested_ko_id is not None:
        output_tag += '_ko%s' % interested_ko_id
    if ignore_ec_with_hyphen is True:
        output_tag += '_NoHyphen'
    output_graphml = '%s/%s%s.graphml' % (ec_file_no_path, ec_file_no_ext, output_tag)
    output_plot = '%s/%s%s.png' % (ec_file_no_path, ec_file_no_ext, output_tag)

    ####################################################################################################################

    interested_ec_list = []
    if interested_ko_id is not None:
        print(datetime.now().strftime(time_format) + 'get ECs from interested KO')

        # read in KEGG db file
        (KO_description_A_dict, KO_description_B_dict, KO_description_C_dict,
         KO_description_D_dict, D2ABCD_dict, ko2level_dict) = get_ko2description_dict(ko00001_keg)

        # get ec list from interested KO category
        interested_ec_list = get_ec_of_interested_ko(D2ABCD_dict, KO_description_D_dict,
                                                     ko2level_dict[interested_ko_id], interested_ko_id)

    # get identified_ec_list
    print(datetime.now().strftime(time_format) + 'read in provided ECs')
    identified_ec_list = set()
    with open(enzymes_list_file) as enzymes_handle:
        for ec in enzymes_handle:
            ec = ec.strip()
            # restrict to the KO category of interest, if one was requested
            if (interested_ko_id is not None) and (ec not in interested_ec_list):
                continue
            # hyphenated ECs are kept only when NoHyphen is explicitly False
            if (ignore_ec_with_hyphen is not False) and ('-' in ec):
                continue
            identified_ec_list.add(ec)

    # initialize a graph
    G = nx.DiGraph()

    # add node and edge
    print(datetime.now().strftime(time_format) + 'add nodes and edges to network')
    with open(db_file_with_ec) as db_handle:
        for reaction in db_handle:
            # first tab-separated column of each reaction line is compared to the EC set
            ec_id = reaction.strip().split('\t')[0]
            if ec_id in identified_ec_list:
                parse_biological_raction(G, reaction, skip_list, compounds_to_include_list, node_color_dict)

    # write out graphml
    print(datetime.now().strftime(time_format) + 'write out network to graphml file')
    nx.write_graphml(G, output_graphml)

    if plot_network is True:
        print(datetime.now().strftime(time_format) + 'plot network')

        # alternatives: planar_layout, fruchterman_reingold_layout
        graph_layout = nx.layout.kamada_kawai_layout(G)

        # turn node attributes into dict
        node_attributes_dict = {node_id: node_attrs for node_id, node_attrs in G.nodes(data=True)}

        # nodes are drawn one at a time because each call takes a single colour/shape
        print(datetime.now().strftime(time_format) + 'plot nodes')
        for node in G:
            nx.draw_networkx_nodes(G, graph_layout,
                                   nodelist=[node],
                                   node_size=node_size,
                                   node_color=node_attributes_dict[node]['color_map'],
                                   node_shape=node_attributes_dict[node]['shape'])

        # label all nodes in one call; draw_networkx_labels has no `nodelist`
        # parameter (every node is labelled by default), so none is passed
        nx.draw_networkx_labels(G, graph_layout, font_size=label_font_size, font_color='black')

        # plot edges
        print(datetime.now().strftime(time_format) + 'plot edges')
        nx.draw_networkx_edges(G, graph_layout, width=0.5, arrows=True, arrowsize=6)

        # save plot
        plt.savefig(output_plot, dpi=300)
        plt.close()

    ####################################################################################################################

    print(datetime.now().strftime(time_format) + 'Done!')
def dbCAN(args):
    """Run dbCAN annotation on a single sequence file or on every file in a folder.

    Args:
        args: mapping of command line options. Keys read:
            ``i``      - input sequence file, or folder of sequence files,
            ``x``      - file extension used to pick files from an input folder,
            ``m``      - sequence type, forwarded to ``dbCAN_worker``,
            ``depth``  - optional depth file (single-file mode) or folder of
                         ``<basename>.depth`` files (folder mode),
            ``db_dir`` - folder holding the dbCAN database files,
            ``t``      - number of worker processes.

    In single-file mode ``dbCAN_worker`` is called once, with the path
    component of the input file as output location. In folder mode results
    go to ``<folder name>_dbCAN_wd`` (created with ``force_create_folder``),
    files are processed in a multiprocessing pool, and summary tables are
    produced with ``get_dbCAN_annot_df``.

    Any failed validation prints a message and terminates via ``exit()``.
    """
    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    DB_dir = args['db_dir']
    num_threads = args['t']

    pwd_hmmscan_parser = '%s/hmmscan-parser.sh' % DB_dir
    pwd_dbCAN_fam_HMMs = '%s/dbCAN-fam-HMMs.txt' % DB_dir
    CAZyDB_fam_activities = '%s/CAZyDB.fam-activities.txt' % DB_dir
    CAZyDB_fam_activities_07312019 = '%s/CAZyDB.07312019.fam-activities.txt' % DB_dir

    # give the dated activities file its generic name when only the dated one is present;
    # os.rename avoids building a shell command from unquoted paths
    if (os.path.isfile(CAZyDB_fam_activities_07312019) is True) and (os.path.isfile(CAZyDB_fam_activities) is False):
        os.rename(CAZyDB_fam_activities_07312019, CAZyDB_fam_activities)

    ############################################ check whether db file exist ###########################################

    # the activities file is read unconditionally below, so it is checked here as well
    unfound_inputs = [each_input
                      for each_input in [pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, CAZyDB_fam_activities]
                      if (not os.path.isfile(each_input)) and (not os.path.isdir(each_input))]
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # store CAZyDB.fam-activities.txt in dict
    fam_to_activities_dict = {}
    with open(CAZyDB_fam_activities) as fam_activities_handle:
        for each_fam in fam_activities_handle:
            # NOTE(review): separator literal reproduced as it appears in the source;
            # confirm it matches the delimiter used in CAZyDB.fam-activities.txt
            each_fam_split = each_fam.strip().split(' ')
            if len(each_fam_split) == 2:
                fam_to_activities_dict[each_fam_split[0]] = each_fam_split[1]

    ################################################## if input is file ################################################

    if os.path.isfile(file_in) is True:

        # check whether depth file exist
        if (depth_file is not None) and (os.path.isfile(depth_file) is False):
            print(datetime.now().strftime(time_format) + 'specified depth file not found, program exited!')
            exit()

        print(datetime.now().strftime(time_format) + 'Running dbCAN for 1 file with %s cores' % (num_threads))

        file_in_path, _file_in_basename, _file_in_ext = sep_path_basename_ext(file_in)
        dbCAN_worker([file_in, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs, sequence_type,
                      file_in_path, fam_to_activities_dict, depth_file])

    ################################################ if input is folder ################################################

    else:
        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) + 'input folder not found, program exited!')
            exit()

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (file_in, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]
        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) + 'input file not found, program exited!')
            exit()

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(datetime.now().strftime(time_format) + 'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                print(datetime.now().strftime(time_format) + 'a single file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'provided depth folder not found, program exited!')
                exit()

            # every input sequence file needs a matching <basename>.depth file
            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                print(datetime.now().strftime(time_format) + 'the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                exit()

        ################################################# define file name #################################################

        # strip a trailing '/' first, otherwise the folder name would be empty
        file_in_folder_name = file_in.rstrip('/').split('/')[-1]

        output_folder = '%s_dbCAN_wd' % file_in_folder_name
        annotation_df_GeneNumber = '%s/%s_GeneNumber.txt' % (output_folder, file_in_folder_name)
        annotation_df_GeneNumber_pct = '%s/%s_GeneNumber_pct.txt' % (output_folder, file_in_folder_name)
        annotation_df_TotalDepth = '%s/%s_TotalDepth.txt' % (output_folder, file_in_folder_name)
        annotation_df_TotalDepth_pct = '%s/%s_TotalDepth_pct.txt' % (output_folder, file_in_folder_name)

        # create output folder
        force_create_folder(output_folder)

        ####################################################### main #######################################################

        print(datetime.now().strftime(time_format) + 'Running dbCAN for %s input files with %s cores' % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_dbCAN = []
        for input_file in input_file_name_list:
            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (file_in, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            list_for_multiple_arguments_dbCAN.append([pwd_input_file, pwd_hmmscan_parser, pwd_dbCAN_fam_HMMs,
                                                      sequence_type, output_folder, fam_to_activities_dict,
                                                      input_file_depth])

        # annotate input files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(dbCAN_worker, list_for_multiple_arguments_dbCAN)
        pool.close()
        pool.join()

        ################################################### get dataframe ##################################################

        get_dbCAN_annot_df(output_folder, annotation_df_GeneNumber, annotation_df_GeneNumber_pct, with_depth=False)
        if depth_file is not None:
            get_dbCAN_annot_df(output_folder, annotation_df_TotalDepth, annotation_df_TotalDepth_pct, with_depth=True)

        # report
        print(datetime.now().strftime(time_format) + 'Data matrix exported to:')
        print(datetime.now().strftime(time_format) + annotation_df_GeneNumber.split('/')[-1])
        print(datetime.now().strftime(time_format) + annotation_df_GeneNumber_pct.split('/')[-1])
        if depth_file is not None:
            print(datetime.now().strftime(time_format) + annotation_df_TotalDepth.split('/')[-1])
            print(datetime.now().strftime(time_format) + annotation_df_TotalDepth_pct.split('/')[-1])

    print(datetime.now().strftime(time_format) + 'Done!')
def COG2014(args):
    """Run COG (2003-2014 release) annotation on one sequence file or a folder of files.

    Args:
        args: mapping of command line options. Keys read:
            ``i``          - input sequence file, or folder of sequence files,
            ``x``          - file extension used to pick files from an input folder,
            ``m``          - sequence type, forwarded to ``COG2014_worker``,
            ``depth``      - optional depth file (single-file mode) or folder of
                             ``<basename>.depth`` files (folder mode),
            ``pct_by_all`` - ``True`` to additionally produce the ``*_pct_by_all`` tables,
            ``db_dir``     - folder holding the COG database files,
            ``t``          - number of threads / worker processes,
            ``diamond``    - ``True`` to search with diamond instead of blastp,
            ``evalue``     - e-value cutoff, forwarded to ``COG2014_worker``.

    Missing BLAST index files or a missing diamond database are built on the
    fly with ``makeblastdb`` / ``diamond makedb``. In single-file mode
    ``COG2014_worker`` is called once with all ``t`` threads. In folder mode
    results go to ``<folder name>_COG2014_wd``, each file is handled by a
    single-threaded worker in a pool of ``t`` processes, and summary tables
    are produced with ``get_COG_annot_df``.

    Any failed validation prints a message and terminates via ``exit()``.
    """
    file_in = args['i']
    file_extension = args['x']
    sequence_type = args['m']
    depth_file = args['depth']
    pct_by_all = args['pct_by_all']
    DB_dir = args['db_dir']
    num_threads = args['t']
    run_diamond = args['diamond']
    evalue_cutoff = args['evalue']

    pwd_prot2003_2014 = '%s/prot2003-2014.fa' % DB_dir
    pwd_prot2003_2014_diamond = '%s/prot2003-2014.fa.dmnd' % DB_dir
    pwd_prot2003_2014_tab = '%s/prot2003-2014.tab' % DB_dir
    pwd_cog2003_2014 = '%s/cog2003-2014.csv' % DB_dir
    pwd_cognames2003_2014 = '%s/cognames2003-2014.tab' % DB_dir
    pwd_fun2003_2014 = '%s/fun2003-2014.tab' % DB_dir

    ############################################ check whether db file exist ###########################################

    # the .tab and .csv files are read unconditionally below, so they are checked here as well
    required_db_files = [pwd_prot2003_2014, pwd_cognames2003_2014, pwd_fun2003_2014,
                         pwd_prot2003_2014_tab, pwd_cog2003_2014]
    unfound_inputs = [each_input for each_input in required_db_files
                      if (not os.path.isfile(each_input)) and (not os.path.isdir(each_input))]
    if len(unfound_inputs) > 0:
        for each_unfound in unfound_inputs:
            print('%s not found' % each_unfound)
        exit()

    # check whether db index exist
    if run_diamond is False:
        unfound_db_index_file = [db_index
                                 for db_index in ['phr', 'pin', 'pnd', 'pni', 'pog', 'psd', 'psi', 'psq']
                                 if not os.path.isfile('%s/prot2003-2014.fa.%s' % (DB_dir, db_index))]
        if len(unfound_db_index_file) > 0:
            print(datetime.now().strftime(time_format) + 'DB index files not found, runing makeblastdb first')
            makeblastdb_cmd = 'makeblastdb -in %s -dbtype prot -parse_seqids -logfile %s.log' % (pwd_prot2003_2014, pwd_prot2003_2014)
            os.system(makeblastdb_cmd)
            print(datetime.now().strftime(time_format) + 'makeblastdb finished')

    if run_diamond is True:
        if os.path.isfile(pwd_prot2003_2014_diamond) is False:
            print(datetime.now().strftime(time_format) + 'DB file not found, making diamond db')
            diamond_makedb_cmd = 'diamond makedb --in %s --db %s --quiet' % (pwd_prot2003_2014, pwd_prot2003_2014_diamond)
            os.system(diamond_makedb_cmd)

    ################################################# read db into dict ################################################

    # get refseq_acc_to_protein_id_dict (prot2003-2014.tab): second column -> first column
    refseq_acc_to_protein_id_dict = {}
    with open(pwd_prot2003_2014_tab) as prot_tab_handle:
        for refseq_acc in prot_tab_handle:
            refseq_acc_split = refseq_acc.strip().split('\t')
            refseq_acc_to_protein_id_dict[refseq_acc_split[1]] = refseq_acc_split[0]

    # get protein_id_to_cog_id_dict (cog2003-2014.csv): third column -> seventh column
    protein_id_to_cog_id_dict = {}
    with open(pwd_cog2003_2014) as cog_csv_handle:
        for protein_to_cog in cog_csv_handle:
            protein_to_cog_split = protein_to_cog.strip().split(',')
            protein_id_to_cog_id_dict[protein_to_cog_split[2]] = protein_to_cog_split[6]

    # get cog_id_to_category_dict and cog_id_to_description_dict (cognames2003-2014.tab)
    cog_id_to_category_dict = {}
    cog_id_to_description_dict = {}
    with open(pwd_cognames2003_2014, encoding='windows-1252') as cognames_handle:
        for cog_id_to_cate_des in cognames_handle:
            if not cog_id_to_cate_des.startswith('#'):
                cog_id_to_cate_des_split = cog_id_to_cate_des.strip().split('\t')
                cog_id = cog_id_to_cate_des_split[0]
                cog_id_to_category_dict[cog_id] = cog_id_to_cate_des_split[1]
                cog_id_to_description_dict[cog_id] = cog_id_to_cate_des_split[2]

    # get cog_category_to_description_dict (fun2003-2014.tab)
    cog_category_list = []
    cog_category_to_description_dict = {}
    with open(pwd_fun2003_2014) as fun_handle:
        for cog_category in fun_handle:
            if not cog_category.startswith('#'):
                cog_category_split = cog_category.strip().split('\t')
                cog_category_list.append(cog_category_split[0])
                cog_category_to_description_dict[cog_category_split[0]] = cog_category_split[1]

    ################################################## if input is file ################################################

    if os.path.isfile(file_in) is True:

        # check whether depth file exist
        if (depth_file is not None) and (os.path.isfile(depth_file) is False):
            print(datetime.now().strftime(time_format) + 'specified depth file not found, program exited!')
            exit()

        print(datetime.now().strftime(time_format) + 'Running COG annotation for 1 file with %s cores' % (num_threads))

        file_in_path, _file_in_basename, _file_in_ext = sep_path_basename_ext(file_in)

        COG2014_worker([file_in, pwd_prot2003_2014, refseq_acc_to_protein_id_dict, protein_id_to_cog_id_dict,
                        cog_id_to_category_dict, cog_id_to_description_dict, cog_category_list,
                        cog_category_to_description_dict, sequence_type, file_in_path, num_threads,
                        run_diamond, evalue_cutoff, depth_file, pct_by_all])

    ################################################ if input is folder ################################################

    else:
        # check whether input folder exist
        if os.path.isdir(file_in) is False:
            print(datetime.now().strftime(time_format) + 'input folder not found, program exited')
            exit()

        # check whether input genome exist
        input_file_re = '%s/*.%s' % (file_in, file_extension)
        input_file_name_list = [os.path.basename(file_name) for file_name in glob.glob(input_file_re)]
        if len(input_file_name_list) == 0:
            print(datetime.now().strftime(time_format) + 'input file not found, program exited')
            exit()

        # check whether depth file exist
        if depth_file is not None:

            if os.path.isfile(depth_file) is True:
                print(datetime.now().strftime(time_format) + 'please provide the folder containing individual depth files (with extension .depth) for each of your input sequence file.')
                print(datetime.now().strftime(time_format) + 'a single file (not folder) detected, program exited!')
                exit()

            if os.path.isdir(depth_file) is False:
                print(datetime.now().strftime(time_format) + 'provided depth folder not found, program exited!')
                exit()

            # every input sequence file needs a matching <basename>.depth file
            undetected_depth_file = []
            for input_seq_file in input_file_name_list:
                input_seq_file_basename = '.'.join(input_seq_file.split('.')[:-1])
                input_seq_file_depth = '%s/%s.depth' % (depth_file, input_seq_file_basename)
                if os.path.isfile(input_seq_file_depth) is False:
                    undetected_depth_file.append(input_seq_file_depth)

            if len(undetected_depth_file) > 0:
                print(datetime.now().strftime(time_format) + 'the following depth files not found, program exited!')
                print(','.join(undetected_depth_file))
                exit()

        ################################################# define file name #################################################

        # strip a trailing '/' first, otherwise the folder name would be empty
        file_in_folder_name = file_in.rstrip('/').split('/')[-1]

        output_folder = '%s_COG2014_wd' % file_in_folder_name

        # create output folder
        force_create_folder(output_folder)

        ####################################################### main #######################################################

        print(datetime.now().strftime(time_format) + 'Running COG annotation for %s files with %s cores' % (len(input_file_name_list), num_threads))

        list_for_multiple_arguments_COG = []
        for input_file in input_file_name_list:
            input_file_basename = '.'.join(input_file.split('.')[:-1])
            pwd_input_file = '%s/%s' % (file_in, input_file)

            # get path to current depth file
            if depth_file is None:
                input_file_depth = None
            else:
                input_file_depth = '%s/%s.depth' % (depth_file, input_file_basename)

            # each worker runs single-threaded; parallelism comes from the pool
            list_for_multiple_arguments_COG.append([pwd_input_file, pwd_prot2003_2014, refseq_acc_to_protein_id_dict,
                                                    protein_id_to_cog_id_dict, cog_id_to_category_dict,
                                                    cog_id_to_description_dict, cog_category_list,
                                                    cog_category_to_description_dict, sequence_type, output_folder,
                                                    1, run_diamond, evalue_cutoff, input_file_depth, pct_by_all])

        # run COG annotation with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(COG2014_worker, list_for_multiple_arguments_COG)
        pool.close()
        pool.join()

        ################################################### get dataframe ##################################################

        def annotation_df_paths(level, metric):
            # the three tables (raw, _pct, _pct_by_all) of one level ('cate'/'id') and metric
            return ['%s/%s_COG2014_%s_%s%s.txt' % (output_folder, file_in_folder_name, level, metric, suffix)
                    for suffix in ('', '_pct', '_pct_by_all')]

        print(datetime.now().strftime(time_format) + 'Data matrix exported to:')

        # gene number tables are always produced, depth tables only when depth was provided
        for metric, with_depth in (('GeneNumber', False), ('TotalDepth', True)):
            if (with_depth is True) and (depth_file is None):
                continue

            cate_dfs = annotation_df_paths('cate', metric)
            id_dfs = annotation_df_paths('id', metric)

            # get df
            get_COG_annot_df(output_folder, 'cog_cate', cate_dfs[0], cate_dfs[1], cate_dfs[2], with_depth=with_depth, pct_by_all=False)
            get_COG_annot_df(output_folder, 'cog_id', id_dfs[0], id_dfs[1], id_dfs[2], with_depth=with_depth, pct_by_all=False)
            if pct_by_all is True:
                get_COG_annot_df(output_folder, 'cog_cate', cate_dfs[0], cate_dfs[1], cate_dfs[2], with_depth=with_depth, pct_by_all=True)
                get_COG_annot_df(output_folder, 'cog_id', id_dfs[0], id_dfs[1], id_dfs[2], with_depth=with_depth, pct_by_all=True)

            # report (file names only, without the output folder)
            id_names = [df_path.split('/')[-1] for df_path in id_dfs]
            cate_names = [df_path.split('/')[-1] for df_path in cate_dfs]
            if pct_by_all is False:
                print(datetime.now().strftime(time_format) + '%s and %s' % (id_names[0], id_names[1]))
                print(datetime.now().strftime(time_format) + '%s and %s' % (cate_names[0], cate_names[1]))
            else:
                print(datetime.now().strftime(time_format) + '%s, %s and %s' % (id_names[0], id_names[1], id_names[2]))
                print(datetime.now().strftime(time_format) + '%s, %s and %s' % (cate_names[0], cate_names[1], cate_names[2]))

    ################################################## Final report ####################################################

    print(datetime.now().strftime(time_format) + 'Done!')
def rename_seq(args):
    """Rename the sequences of one file, or of every matching file in a folder.

    Args:
        args: mapping of command line options. Keys read:
            ``in``      - input sequence file or folder,
            ``x``       - file extension used to pick files from an input folder,
            ``sep_in``  - forwarded to ``rename_seq_worker``,
            ``sep_out`` - forwarded to ``rename_seq_worker``,
            ``n``       - forwarded to ``rename_seq_worker`` (columns to keep),
            ``prefix``  - single-file mode: value forwarded to the worker as is;
                          folder mode: path to a tab-separated table mapping
                          file name to prefix (optional),
            ``oneline`` - forwarded to ``rename_seq_worker``,
            ``t``       - number of worker processes (folder mode).

    Single-file mode writes ``<path>/<basename>_renamed<ext>``; folder mode
    writes into a new ``<folder name>_renamed`` folder in the working
    directory. Existing outputs are never overwritten: the function prints a
    message and terminates via ``exit()`` instead, as it does for any other
    failed validation.
    """
    seq_file_in = args['in']
    file_extension = args['x']
    sep_in = args['sep_in']
    sep_out = args['sep_out']
    column_to_keep = args['n']
    add_prefix = args['prefix']
    one_line = args['oneline']
    num_threads = args['t']

    ################################################## if input is file ################################################

    if os.path.isfile(seq_file_in) is True:

        ctg_file_path, ctg_file_basename, ctg_file_ext = sep_path_basename_ext(seq_file_in)
        seq_file_out = '%s/%s_renamed%s' % (ctg_file_path, ctg_file_basename, ctg_file_ext)

        # the check must look at the OUTPUT file; testing the input file (which
        # always exists in this branch) would abort every single-file run
        if os.path.isfile(seq_file_out) is True:
            print('Output file detected, program exited: %s' % seq_file_out)
            exit()

        rename_seq_worker([seq_file_in, seq_file_out, sep_in, sep_out, column_to_keep, add_prefix, one_line])

    ################################################ if input is folder ################################################

    if os.path.isdir(seq_file_in) is True:

        # drop a trailing '/' so that the folder name can be taken from the path
        if seq_file_in[-1] == '/':
            seq_file_in = seq_file_in[:-1]

        seq_in_folder_no_path = seq_file_in.split('/')[-1]
        seq_out_folder = '%s_renamed' % seq_in_folder_no_path

        seq_in_re = '%s/*.%s' % (seq_file_in, file_extension)
        seq_in_list = [os.path.basename(file_name) for file_name in glob.glob(seq_in_re)]

        if len(seq_in_list) == 0:
            print('No sequence file detected, program exited!')
            exit()

        if os.path.isdir(seq_out_folder) is True:
            print('Output folder detected, program exited: %s' % seq_out_folder)
            exit()

        # read and validate the prefix table before creating anything on disk, so a
        # bad table does not leave behind an empty output folder that blocks the rerun
        prefix_dict = {}
        if add_prefix is not None:

            if os.path.isfile(add_prefix) is False:
                print('Prefix file not detected, program exited!')
                exit()

            # read in prefix
            with open(add_prefix) as prefix_handle:
                for each_genome in prefix_handle:
                    each_genome_split = each_genome.strip().split('\t')
                    # ignore blank or incomplete lines
                    if len(each_genome_split) >= 2:
                        prefix_dict[each_genome_split[0]] = each_genome_split[1]

            genome_without_prefix = [each_seq_file for each_seq_file in seq_in_list
                                     if each_seq_file not in prefix_dict]
            if len(genome_without_prefix) > 0:
                print('Prefix for the following files not found, , program exited!')
                print(','.join(genome_without_prefix))
                exit()

        os.mkdir(seq_out_folder)

        argument_lol_for_rename_seq_worker = []
        for each_seq_file in seq_in_list:
            pwd_seq_in = '%s/%s' % (seq_file_in, each_seq_file)
            pwd_seq_out = '%s/%s' % (seq_out_folder, each_seq_file)
            argument_lol_for_rename_seq_worker.append([pwd_seq_in, pwd_seq_out, sep_in, sep_out, column_to_keep,
                                                       prefix_dict.get(each_seq_file, None), one_line])

        # rename sequence files with multiprocessing
        pool = mp.Pool(processes=num_threads)
        pool.map(rename_seq_worker, argument_lol_for_rename_seq_worker)
        pool.close()
        pool.join()

    print('Done!')
def COG2020_worker(argument_list):
    """Annotate one sequence file against a COG protein database and write stats.

    ``argument_list`` is read positionally:
        0  pwd_input_file                    input sequence file (parsed as fasta)
        1  pwd_prot2003_2014                 blastp database; diamond uses '<this>.dmnd'
        2  protein_id_to_cog_id_dict         reference protein id -> iterable of COG ids
        3  cog_id_to_category_dict           COG id -> category string, iterated
                                             character by character
        4  cog_id_to_description_dict        COG id -> description
        5  cog_category_list                 categories reported in the func_stats files
        6  cog_category_to_description_dict  category -> description
        7  sequence_type                     'N'/'n' (translated with dna2aa first)
                                             or 'P'/'p' (used as is)
        8  output_folder                     parent of '<basename>_COG2020_wd'
        9  thread_num                        threads given to blastp/diamond
        10 run_diamond                       False -> blastp, anything else -> diamond
        11 evalue_cutoff                     e-value given to blastp/diamond
        12 depth_file                        None, or tab-separated 'gene<TAB>depth' file
        13 pct_by_all                        True -> also normalise by all query genes

    Outputs (all inside '<output_folder>/<basename>_COG2020_wd'): the raw and
    best-hit blast tables, a query-to-COG table, GeneNumber statistics per COG
    and per category, and their normalised (_pct / _pct_by_all) versions.
    TotalDepth counterparts are only written when a depth file is given.
    Exits via ``exit()`` on an unrecognised sequence type.
    """
    (pwd_input_file,
     pwd_prot2003_2014,
     protein_id_to_cog_id_dict,
     cog_id_to_category_dict,
     cog_id_to_description_dict,
     cog_category_list,
     cog_category_to_description_dict,
     sequence_type,
     output_folder,
     thread_num,
     run_diamond,
     evalue_cutoff,
     depth_file,
     pct_by_all) = argument_list[:14]

    has_depth = depth_file is not None

    _, input_seq_no_ext, _ = sep_path_basename_ext(pwd_input_file)
    current_output_folder = '%s/%s_COG2020_wd' % (output_folder, input_seq_no_ext)

    def out_path(suffix):
        # every output file is '<work dir>/<input basename>_<suffix>'
        return '%s/%s_%s' % (current_output_folder, input_seq_no_ext, suffix)

    pwd_blastp_output = out_path('blastp.tab')
    pwd_blastp_output_besthits = out_path('blastp_besthits.tab')
    pwd_query_to_cog_txt = out_path('query_to_cog.txt')

    pwd_cog_stats_GeneNumber = out_path('cog_stats_GeneNumber.txt')
    pwd_cog_stats_TotalDepth = out_path('cog_stats_TotalDepth.txt')
    pwd_func_stats_GeneNumber = out_path('func_stats_GeneNumber.txt')
    pwd_func_stats_TotalDepth = out_path('func_stats_TotalDepth.txt')

    pwd_cog_stats_GeneNumber_pct = out_path('cog_stats_GeneNumber_pct.txt')
    pwd_cog_stats_TotalDepth_pct = out_path('cog_stats_TotalDepth_pct.txt')
    pwd_func_stats_GeneNumber_pct = out_path('func_stats_GeneNumber_pct.txt')
    pwd_func_stats_TotalDepth_pct = out_path('func_stats_TotalDepth_pct.txt')

    pwd_cog_stats_GeneNumber_pct_by_all = out_path('cog_stats_GeneNumber_pct_by_all.txt')
    pwd_cog_stats_TotalDepth_pct_by_all = out_path('cog_stats_TotalDepth_pct_by_all.txt')
    pwd_func_stats_GeneNumber_pct_by_all = out_path('func_stats_GeneNumber_pct_by_all.txt')
    pwd_func_stats_TotalDepth_pct_by_all = out_path('func_stats_TotalDepth_pct_by_all.txt')

    force_create_folder(current_output_folder)

    input_seq_aa = ''
    if sequence_type in ['N', 'n']:
        input_seq_aa = '%s_aa.fasta' % input_seq_no_ext
        dna2aa(pwd_input_file, input_seq_aa)
    elif sequence_type in ['P', 'p']:
        input_seq_aa = pwd_input_file
    else:
        print('Specified input sequence type unrecognizable, program exited!')
        exit()

    # run blastp
    if run_diamond is False:
        os.system('blastp -query %s -db %s -out %s -evalue %s -outfmt 6 -show_gis -num_threads %s'
                  % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output, evalue_cutoff, thread_num))
    else:
        os.system('diamond blastp -q %s --db %s.dmnd --out %s --evalue %s --outfmt 6 --threads %s --quiet'
                  % (input_seq_aa, pwd_prot2003_2014, pwd_blastp_output, evalue_cutoff, thread_num))

    # keep only best hits
    best_hit({'i': pwd_blastp_output, 'o': pwd_blastp_output_besthits})

    # get query_to_ref_protein_dict (dots in subject ids are replaced by underscores)
    query_to_ref_protein_dict = {}
    with open(pwd_blastp_output_besthits) as besthits_handle:
        for each_hit in besthits_handle:
            each_hit_split = each_hit.strip().split('\t')
            query_to_ref_protein_dict[each_hit_split[0]] = each_hit_split[1].replace('.', '_')

    # get query sequences list
    query_seq_list = [query_seq.id for query_seq in SeqIO.parse(pwd_input_file, 'fasta')]

    # export annotation
    cog_id_num_dict = {}
    cog_id_to_gene_member_dict = {}
    cog_cate_num_dict = {}
    cog_cate_to_gene_member_dict = {}
    genes_with_cog = set()
    with open(pwd_query_to_cog_txt, 'w') as query_to_cog_handle:
        query_to_cog_handle.write('Query\tCOG\tCategory\tDescription\n')
        for query_gene in sorted(query_seq_list):

            # genes without a hit, or whose hit has no COG, get an id-only line
            if (query_gene not in query_to_ref_protein_dict
                    or query_to_ref_protein_dict[query_gene] not in protein_id_to_cog_id_dict):
                query_to_cog_handle.write('%s\n' % (query_gene))
                continue

            db_protein_id = query_to_ref_protein_dict[query_gene]
            for cog_id in protein_id_to_cog_id_dict[db_protein_id]:
                cog_cate = cog_id_to_category_dict[cog_id]
                cog_des = cog_id_to_description_dict[cog_id]
                query_to_cog_handle.write('%s\t%s\t%s\t%s\n' % (query_gene, cog_id, cog_cate, cog_des))
                genes_with_cog.add(query_gene)

                # update cog_id_num_dict
                cog_id_num_dict[cog_id] = cog_id_num_dict.get(cog_id, 0) + 1
                cog_id_to_gene_member_dict.setdefault(cog_id, []).append(query_gene)

                # update cog_cate_num_dict (one count per character of the category string)
                for each_cog_cate in cog_cate:
                    cog_cate_num_dict[each_cog_cate] = cog_cate_num_dict.get(each_cog_cate, 0) + 1
                    cog_cate_to_gene_member_dict.setdefault(each_cog_cate, []).append(query_gene)

    # read in depth info
    gene_depth_dict = {}
    if has_depth:
        with open(depth_file) as depth_handle:
            for each_depth in depth_handle:
                each_depth_split = each_depth.strip().split('\t')
                gene_depth_dict[each_depth_split[0]] = float(each_depth_split[1])

    # get TotalDepth of all query genes or genes with cog assignment
    if has_depth:
        genes_with_cog_TotalDepth = get_gene_list_TotalDepth(genes_with_cog, gene_depth_dict)
        total_depth_for_all_query_genes = get_gene_list_TotalDepth(query_seq_list, gene_depth_dict)

    def summed_depth(gene_list):
        # Explicit left-to-right accumulation (not sum()) so the totals stay
        # bit-identical on every Python version; then round to two decimals.
        total = 0
        for each_gene in gene_list:
            total += gene_depth_dict[each_gene]
        return float("{0:.2f}".format(total))

    #################### export cog_stats_GeneNumber ####################

    with open(pwd_cog_stats_GeneNumber, 'w') as stats_handle:
        stats_handle.write('COG\tGeneNumber\tDescription\n')
        for each_cog_id in cog_id_num_dict:
            stats_handle.write('%s\t%s\t%s\n' % (each_cog_id,
                                                 cog_id_num_dict[each_cog_id],
                                                 cog_id_to_description_dict[each_cog_id]))

    #################### export cog_stats_TotalDepth ####################

    if has_depth:
        with open(pwd_cog_stats_TotalDepth, 'w') as stats_handle:
            stats_handle.write('COG\tTotalDepth\tDescription\n')
            for each_cog_id in cog_id_to_gene_member_dict:
                stats_handle.write('%s\t%s\t%s\n' % (each_cog_id,
                                                     summed_depth(cog_id_to_gene_member_dict[each_cog_id]),
                                                     cog_id_to_description_dict[each_cog_id]))

    #################### export func_stats_GeneNumber ####################

    with open(pwd_func_stats_GeneNumber, 'w') as stats_handle:
        stats_handle.write('Category\tGeneNumber\tDescription\n')
        for each_cog_cate in cog_category_list:
            stats_handle.write('%s\t%s\t%s\n' % (each_cog_cate,
                                                 cog_cate_num_dict.get(each_cog_cate, 0),
                                                 cog_category_to_description_dict[each_cog_cate]))

    #################### export func_stats_TotalDepth ####################

    if has_depth:
        with open(pwd_func_stats_TotalDepth, 'w') as stats_handle:
            stats_handle.write('Category\tTotalDepth\tDescription\n')
            for each_cog_cate in cog_category_list:
                stats_handle.write('%s\t%s\t%s\n' % (each_cog_cate,
                                                     summed_depth(cog_cate_to_gene_member_dict.get(each_cog_cate, [])),
                                                     cog_category_to_description_dict[each_cog_cate]))

    #################### get pct files ####################

    def write_pct(file_in, divisor, file_out, value_name):
        # NOTE(review): the COG-level pct files also get a 'Category' first-column
        # header although their input files use 'COG'. Kept unchanged here because
        # downstream readers are not visible from this function - confirm before
        # renaming.
        AnnotateNorm(file_in=file_in, skip_header=True, value_column=2,
                     Divisor_value=divisor, file_out=file_out,
                     file_out_header='Category\t%s\tDescription\n' % value_name)

    write_pct(pwd_cog_stats_GeneNumber, len(genes_with_cog),
              pwd_cog_stats_GeneNumber_pct, 'GeneNumber_pct')
    write_pct(pwd_func_stats_GeneNumber, len(genes_with_cog),
              pwd_func_stats_GeneNumber_pct, 'GeneNumber_pct')
    if has_depth:
        write_pct(pwd_cog_stats_TotalDepth, genes_with_cog_TotalDepth,
                  pwd_cog_stats_TotalDepth_pct, 'TotalDepth_pct')
        write_pct(pwd_func_stats_TotalDepth, genes_with_cog_TotalDepth,
                  pwd_func_stats_TotalDepth_pct, 'TotalDepth_pct')

    if pct_by_all is True:
        write_pct(pwd_cog_stats_GeneNumber, len(query_seq_list),
                  pwd_cog_stats_GeneNumber_pct_by_all, 'GeneNumber_pct_by_all')
        write_pct(pwd_func_stats_GeneNumber, len(query_seq_list),
                  pwd_func_stats_GeneNumber_pct_by_all, 'GeneNumber_pct_by_all')
        if has_depth:
            write_pct(pwd_cog_stats_TotalDepth, total_depth_for_all_query_genes,
                      pwd_cog_stats_TotalDepth_pct_by_all, 'TotalDepth_pct_by_all')
            write_pct(pwd_func_stats_TotalDepth, total_depth_for_all_query_genes,
                      pwd_func_stats_TotalDepth_pct_by_all, 'TotalDepth_pct_by_all')
# Each ':'-separated chunk of a BLCA taxonomy string ends with the NAME OF THE
# NEXT rank, so the prefix of the current taxon is looked up by that trailing
# field; an empty trailing field marks the last (species) entry.
_BLCA_NEXT_RANK_TO_PREFIX = {
    'phylum': 'd',
    'class': 'p',
    'order': 'c',
    'family': 'o',
    'genus': 'f',
    'species': 'g',
    '': 's',
}


def _BLCA_format_taxon(taxon_blca_raw):
    """Return (taxonomy with confidence values, rank-prefixed taxonomy) for one record."""
    if taxon_blca_raw == 'Unclassified':
        return 'Unclassified', 'Unclassified'

    formatted_taxon_list_with_num = []
    formatted_taxon_list_no_num = []

    # the first chunk only holds the name of the top rank, so it is dropped
    for each_str in taxon_blca_raw.strip().split(':')[1:]:
        each_str_split = each_str.split(';')
        current_rank = _BLCA_NEXT_RANK_TO_PREFIX.get(each_str_split[-1], '')

        # confidence value is truncated to its first five characters
        formatted_taxon_list_with_num.append('%s(%s)' % (each_str_split[0], each_str_split[1][:5]))
        formatted_taxon_list_no_num.append('%s__%s' % (current_rank, each_str_split[0]))

    # spaces inside taxon names are replaced by underscores
    return (';'.join(formatted_taxon_list_with_num).replace(' ', '_'),
            ';'.join(formatted_taxon_list_no_num).replace(' ', '_'))


def BLCA_op_parser(args):
    """Reformat a BLCA classification file into two tab-separated files.

    Reads ``args['in']``: one record per line, sequence id and taxonomy string
    separated by a tab. Writes, next to the input file:
        <basename>_reformatted_1.txt -- 'taxon(confidence)' entries joined by ';'
        <basename>_reformatted_2.txt -- '<rank prefix>__taxon' entries joined by ';'
    Records whose taxonomy is exactly 'Unclassified' are written unchanged.
    If a sequence id occurs more than once, the last record wins.
    """
    blca_output = args['in']

    file_in_path, file_in_basename, _ = sep_path_basename_ext(blca_output)
    output_file_1 = '%s/%s_reformatted_1.txt' % (file_in_path, file_in_basename)
    output_file_2 = '%s/%s_reformatted_2.txt' % (file_in_path, file_in_basename)

    # read in input file
    s16_taxon_blca_dict = {}
    with open(blca_output) as blca_handle:
        for each_16s_taxon in blca_handle:
            if not each_16s_taxon.strip():
                continue  # tolerate blank lines
            each_16s_taxon_split = each_16s_taxon.strip().split('\t')
            s16_taxon_blca_dict[each_16s_taxon_split[0]] = each_16s_taxon_split[1]

    # Format everything before opening the outputs, so a malformed record
    # cannot leave half-written files behind.
    formatted_records = []
    for each_16s, taxon_blca_raw in s16_taxon_blca_dict.items():
        taxon_with_num, taxon_no_num = _BLCA_format_taxon(taxon_blca_raw)
        formatted_records.append((each_16s, taxon_with_num, taxon_no_num))

    with open(output_file_1, 'w') as output_file_1_handle, \
            open(output_file_2, 'w') as output_file_2_handle:
        for each_seq, taxon_with_num, taxon_no_num in formatted_records:
            output_file_1_handle.write('%s\t%s\n' % (each_seq, taxon_with_num))
            output_file_2_handle.write('%s\t%s\n' % (each_seq, taxon_no_num))