## NOTE: the imports below are an assumption for standalone reading of this excerpt.
## In panX, read_fasta, write_in_fa, write_pickle and load_pickle live in sf_miscellaneous;
## check_dependency, load_sorted_clusters, mpm_tree, multips, times, concatenate_cluster_files,
## cutTree_outputCluster, gbk_translation and the other helpers are provided by the
## surrounding panX modules.
import os, glob, time
import numpy as np
from collections import defaultdict, Counter
from sf_miscellaneous import read_fasta, write_in_fa, write_pickle, load_pickle


def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for RNA_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len(read_fasta(RNA_cluster_nu_filename)) == 1: # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace('.fna', '_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in read_fasta(RNA_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
            else: # align and build tree
                print RNA_cluster_nu_filename
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine(CDS=False)
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)
                RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
                geneDiversity_file.write('%s\t%s\n' % (clusterID, RNA_diversity_values))
                print clusterID, RNA_diversity_values
        except:
            print("Aligning and tree building of RNA %s failed" % RNA_cluster_nu_filename)

def align_and_makeTree(fna_file_list, alignFile_path, simple_tree):
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            start = time.time()
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len(read_fasta(gene_cluster_nu_filename)) == 1: # nothing to do for singletons
                ## na_aln.fa
                gene_cluster_nu_aln_filename = gene_cluster_nu_filename.replace('.fna', '_na_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                os.system(' '.join(['cp', gene_cluster_nu_aln_filename,
                                    gene_cluster_nu_aln_filename.replace('_aln', '_aln_reduced')]))
                ## aa_aln.fa
                gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                gene_cluster_aa_aln_filename = gene_cluster_nu_filename.replace('.fna', '_aa_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_aa_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                os.system(' '.join(['cp', gene_cluster_aa_aln_filename,
                                    gene_cluster_aa_aln_filename.replace('_aln', '_aln_reduced')]))
                geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
            else: # align and build tree
                myTree = mpm_tree(gene_cluster_nu_filename)
                myTree.codon_align()
                myTree.translate()
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path)
                diversity_nuc = round(myTree.diversity_nuc, 3)
                geneDiversity_file.write('%s\t%s\n' % (clusterID, diversity_nuc))
        except:
            print("Aligning and tree building of %s failed" % gene_cluster_nu_filename)

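## --- Illustration (not part of panX): align_and_makeTree expects a list of per-cluster
## *.fna files plus the folder that holds them. A minimal, hypothetical invocation:
##   import glob
##   fna_files = glob.glob('./data/geneCluster/GC_*.fna')
##   align_and_makeTree(fna_files, './data/geneCluster/', simple_tree=False)
## In the surrounding pipeline such file lists appear to be handed to parallel workers
## (compare the multips(...) call in estimate_core_gene_diversity below).
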
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join([clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_seqs_path = '%ssubproblem_cluster_seqs/' % clustering_path
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt = {}
        cluster_input_lines = [iline for iline in cluster_input]
        subproblem_run_number = input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):
            ## use time to avoid clusterID conflict
            clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid, time.strftime('%M%S', time.gmtime()))
            gene_ids = iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID] = gene_ids
            ## representative_seq
            representative_seq = subproblem_faa_dict[gene_ids[0]]
            ## write in representative strain
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID, representative_seq)
    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']), subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix, ': ', times(start), '\n'

def gather_seq_length(faa_path):
    """ record the length of each amino-acid sequence found in the per-strain *.faa files """
    seq_length_dt = defaultdict()
    for faa_file in glob.iglob(''.join([faa_path, '*faa'])):
        for gene_tag, seq in read_fasta(faa_file).iteritems():
            seq_length_dt[gene_tag] = len(seq)
    return seq_length_dt

def make_gene_presence_absence_matrix(input_filepath, output_filepath):
    ## NOTE: output_filepath was undefined in the original version of this function;
    ## it is exposed here as a parameter (e.g. a *.csv path) as an assumption about
    ## the intended interface.
    os.chdir(input_filepath)
    gene_order = ','.join([gene.rstrip() for gene, content in load_sorted_clusters('./')])
    with open('./geneCluster/genePresence.aln') as inputf,\
         open(output_filepath, 'wb') as outputf:
        outputf.write('accession,%s\n' % gene_order)
        for strain, genes in read_fasta(inputf).iteritems():
            outputf.write('%s,%s\n' % (strain, ','.join(genes)))

def find_and_merge_unclustered_genes(path, nstrains, window_size=5, strain_proportion=0.3, sigma_scale=3):
    """
    detect the unclustered genes and concatenate them
    params:
        nstrains: total number of strains
        window_size
        strain_proportion
        sigma_scale
    return:
        a dict with key of the merged cluster and value of a list of
        related unclustered cluster-names for deletion
    """
    file_path = '%s%s' % (path, 'geneCluster/')
    gene_clusters = load_sorted_clusters(path)
    length_to_cluster = defaultdict(list)
    length_list = []
    ## calculate cluster length distribution, link clusterIDs with their clusterLength
    for gid, (clusterID, gene) in enumerate(gene_clusters):
        # average length of the cluster in amino acids
        clusterLength = int(np.mean([len(igene) for igene in
                            read_fasta(file_path+'%s%s' % (clusterID, '.fna')).values()])/3.0)
        length_to_cluster[clusterLength].append(clusterID)
        length_list.append(clusterLength)
    cluster_length_distribution = np.bincount(length_list)

    ## calculate smoothed cluster length distribution
    window = np.ones(window_size, dtype=float)/window_size
    smoothed_length_distribution = np.convolve(cluster_length_distribution, window, mode='same')

    ## detect peaks
    peaks = (cluster_length_distribution - smoothed_length_distribution) > \
            np.maximum(strain_proportion*nstrains, sigma_scale*np.sqrt(smoothed_length_distribution))
    position_peaks = np.where(peaks)[0]

    ## concatenate clusters with the same average length, return dict of these clusters
    merged_clusters_dict = defaultdict(dict)
    for index, i_peak in enumerate(position_peaks, 1):
        merged_cluster_filename, cluster_needed_deletion = \
            concatenate_cluster_files(length_to_cluster[i_peak], index, file_path)
        merged_clusters_dict[merged_cluster_filename] = cluster_needed_deletion
    return merged_clusters_dict

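## --- Illustration (not part of panX): the peak detection above flags length bins whose
## cluster count exceeds the locally smoothed length distribution by more than
## max(strain_proportion*nstrains, sigma_scale*sqrt(smoothed)). Toy example (values made up):
##   import numpy as np
##   length_list = [10, 10, 10, 10, 10, 12, 30]    # cluster lengths in amino acids
##   dist = np.bincount(length_list)               # raw length distribution
##   smoothed = np.convolve(dist, np.ones(5)/5.0, mode='same')
##   peaks = (dist - smoothed) > np.maximum(0.3*4, 3*np.sqrt(smoothed))   # nstrains=4
##   print np.where(peaks)[0]                      # -> [10]: bin 10 is a peak of unclustered genes
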
def calculate_aln_consensus(aln_file):
    """ compute a consensus sequence (majority character per column) for an alignment file """
    aln_dt = read_fasta(aln_file)
    alphabet = 'ACDEFGHIKLMNPQRSTVWY*-X'  #alphabet = 'ACGT-N'
    if len(aln_dt) == 1:
        ## only one seq; replace letters not in alphabet by 'X'
        consensus_arr_seq = ''.join([ic if ic in alphabet else 'X' for ic in aln_dt.values()[0]])
    else:
        ## consensus of multiple seqs
        try:
            aln_array = np.array([i for i in aln_dt.values()])
            aln_array = aln_array.view('S1').reshape((aln_array.size, -1))
            af = np.zeros((len(alphabet), aln_array.shape[1]))
            for ai, state in enumerate(alphabet):
                af[ai] += (aln_array == state).mean(axis=0)
            ## assign invalid characters to the last letter in alphabet (N for nuc or X for aa)
            af[-1] = 1.0 - af[:-1].sum(axis=0)
            consensus_arr_seq = ''.join([alphabet[ic] for ic in af.argmax(axis=0)])
        except:
            print 'errors in calculating consensus seq: ', aln_file
    return consensus_arr_seq

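## --- Illustration (not part of panX): column-wise majority consensus; e.g. for the three
## aligned rows 'MKT-', 'MKS-', 'MKT-' the consensus is 'MKT-'. A typical call
## (path hypothetical):
##   consensus_seq = calculate_aln_consensus('./data/geneCluster/GC_00001_aa_aln.fa')
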
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 (soft core, considered as core if present in 90% of strains)
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    strain_list = load_pickle(path+'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set([i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path+'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain*core_cutoff)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath+coreGeneName
                if os.path.exists(coreGeneName_path) and len(read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath != '' and \
                       len(core_strain_set - set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName+'\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass
    write_pickle(output_path+'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path+'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])
    snps_by_gene = []
    for align_file in corelist: ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath+align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' '*len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(), key=lambda x: x[0])
        start_flag = 0
        if core_cutoff == 1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack((nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = [gene for gene in strain_seq_dt.keys()]
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (strain, align_file)
                        nuc_array = np.array(np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack((nuc_array, np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (strain, align_file)
                        nuc_array = np.vstack((nuc_array, np.fromstring(missing_gene_seq, dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(masked_non_missing_array != masked_non_missing_array[0, :], axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path+'snp_pos.cpk', snp_pos_dt)

    with open(output_path+'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())

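## --- Illustration (not part of panX): SNP columns are positions that differ between strains
## and contain no gap. For three aligned rows 'ACGT', 'ACGA', 'AC-A', column 3 (T/A/A) is
## polymorphic with no gap, while column 2 contains a gap and is excluded:
##   import numpy as np
##   rows = np.array(['ACGT', 'ACGA', 'AC-A']).view('S1').reshape(3, -1)
##   polymorphic = np.any(rows != rows[0, :], axis=0)
##   has_gap = np.any(rows == '-', axis=0)
##   print np.where(polymorphic & ~has_gap)[0]   # -> [3]
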
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2, geneCluster_dt):
    """ delete the old cluster and create two new clusters
        params:
            new_fa_files: list to which new file names are appended
            gene_list1/2: lists containing the genes in the new split clusters
            geneCluster_dt: cluster dictionary to be updated
    """
    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk', '')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk', 'fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk', 'faa')
    split_fa_files_set = set()
    ## load genes from old clusters
    origin_nu_fa_dt = read_fasta(file_path+origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_cluster_aa_fa)
    sgs_index = 0

    ## delete old (split) clusters
    try:
        del geneCluster_dt[clusterID]
        with open(file_path+'old_clusters_paralogSplit.txt', 'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n' % clusterID)
        if os.path.exists(fname):
            suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa', '.nwk', '_tree.json']
        else:
            suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
        tmp_files = ' '.join([file_path+clusterID+suffix for suffix in suffix_list])
        command_move_deleted_clusters = ' '.join(['mv', tmp_files, file_path+'paralog_splits/'])
        os.system(command_move_deleted_clusters)
    except:
        print("paralog splitting: can't delete", orgin_nwk_name)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for split_gene_list in (list(gene_list1), list(gene_list2)):
        sgs_index += 1
        newClusterId = "%s_p%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_nu_write = open(file_path+gene_cluster_nu_filename, 'wb')
        gene_cluster_aa_write = open(file_path+gene_cluster_aa_filename, 'wb')
        split_fa_files_set |= set([file_path+gene_cluster_nu_filename])
        ## write new split cluster files
        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:
                gene_memb = gene_memb.replace("\\'", "'")
            try:
                write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])
            except:
                print 'paralogy splitting (problem to write new split cluster files)', fname
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()

        geneCluster_dt[newClusterId] = [0, [], 0]
        ## num_strains
        geneCluster_dt[newClusterId][0] = len(dict(Counter([ig.split('|')[0] for ig in split_gene_list])).keys())
        ## num_genes
        geneCluster_dt[newClusterId][2] = len(dict(Counter([ig for ig in split_gene_list])).keys())
        ## gene members
        geneCluster_dt[newClusterId][1] = [ig.split('-')[0] for ig in split_gene_list]
    return split_fa_files_set

def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """ estimate core gene diversity before gene cluster alignment and cluster post-processing """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path+'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path+'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff == 1.0:
            strain_core_cutoff = totalStrain
        else:
            strain_core_cutoff = int(totalStrain*core_cutoff)
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0] == cluster_stats[2] and cluster_stats[0] >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## disabled path that would rebuild the sequence pickles from the per-strain files
    if 0:
        ## create dict storing all genes' translations
        gene_aa_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id] = read_fasta(''.join([protein_path, accession_id, '.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)
        ## create dict for all genes' nucleotide sequences
        gene_na_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id] = read_fasta(''.join([nucleotide_path, accession_id, '.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt, geneID_to_geneSeqID_dict,
                           gene_na_dict, gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path+"*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    refined_core_diversity = round((0.1+factor_core_diversity*calculated_core_diversity) /
                                   (1+factor_core_diversity*calculated_core_diversity), 4)
    print('factor used: '+str(factor_core_diversity))
    print('average core genome diversity: '+str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: '+str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity

def output_cutted_clusters(file_path, uncluster_filename, gene_list, cut_branch_threshold,
                           treefile_used=None, cut_leftover=None):
    """ delete the unclustered file and create new clusters
        params:
            gene_list: lists containing the genes in the new split clusters
            geneCluster_dt: cluster dictionary to be updated
            cut_leftover: flag indicating whether there are leftover nodes
                after cutting long branches. Default: empty.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna', 'faa')
    new_fa_files = set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path+origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if cut_leftover == True:
            ## newClusterId for the leftover genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        ## write new divided/split cluster files
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_nu_filepath = file_path+gene_cluster_nu_filename
        gene_cluster_nu_write = open(gene_cluster_nu_filepath, 'wb')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_aa_filepath = file_path+gene_cluster_aa_filename
        gene_cluster_aa_write = open(gene_cluster_aa_filepath, 'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:
                ## replace "\'" in the node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## using origin_nu_fa_dt[gene_memb] directly would throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb = gene_memb.replace("\\'", "'")
            write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()

        if cut_leftover == True:
            ## align the leftover genes, build a tree and keep cutting long branches
            ## until nothing more can be cut
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
        else:
            ## add record in new_clusters_longSplit.txt, which is used to align new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt = defaultdict(list)
            addin_geneCluster_dt[newClusterId] = [0, [], 0]
            ## num_strains
            addin_geneCluster_dt[newClusterId][0] = len(dict(Counter([ig.split('|')[0] for ig in split_gene_list])).keys())
            ## num_genes
            addin_geneCluster_dt[newClusterId][2] = len(dict(Counter([ig for ig in split_gene_list])).keys())
            ## gene members
            addin_geneCluster_dt[newClusterId][1] = [ig.split('-')[0] for ig in split_gene_list]
            ## cPickle new cluster statistics
            write_pickle(''.join([file_path, 'update_long_branch_splits/', newClusterId, '.cpk']),
                         addin_geneCluster_dt)

    ## write records in gene_diversity file
    with open(file_path+'new_clusters_longSplit.txt', 'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n' % i)

def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    create json file for gene cluster table visualization
    input: path to geneCluster output
    output: geneCluster.json
    """
    # define paths
    geneCluster_path = '%s%s' % (path, 'geneCluster/')
    output_path = '%s%s' % (path, 'vis/')

    # open files
    geneClusterJSON_outfile = open(output_path+'geneCluster.json', 'wb')
    ## store locus_tags in a separate file for large datasets
    if store_locus_tag:
        locus_tag_outfile = open(path+'search_locus_tag.tsv', 'wb')

    ### load precomputed annotations, diversity, associations etc.
    # load geneID_to_descriptions
    geneID_to_descriptions = load_pickle(path+'geneID_to_description.cpk')
    if enable_RNA_clustering:
        # load RNAID_to_description_file
        geneID_to_descriptions.update(load_pickle(path+'RNAID_to_description.cpk'))

    gene_diversity_Dt = load_pickle(geneCluster_path+'gene_diversity.cpk')
    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(geneCluster_path+'dt_geneEvents.cpk')

    ## load associations
    branch_associations_path = path+'branch_association.cpk'
    if os.path.isfile(branch_associations_path):
        branch_associations = load_pickle(branch_associations_path)
    else:
        branch_associations = {}
    presence_absence_associations_path = path+'presence_absence_association.cpk'
    if os.path.isfile(presence_absence_associations_path):
        presence_absence_associations = load_pickle(presence_absence_associations_path)
    else:
        presence_absence_associations = {}

    ## load list of clusters sorted by strain count
    sorted_genelist = load_sorted_clusters(path)

    geneClusterJSON_outfile.write('[')
    ## sorted_genelist: [(clusterID, [count_strains, [memb1,...], count_genes]), ...]
    for gid, (clusterID, gene) in enumerate(sorted_genelist):
        strain_count, gene_list, gene_count = gene
        if gid != 0: ## separate records after the first one
            geneClusterJSON_outfile.write(',\n')

        ## majority annotation
        allAnn, majority_annotation = consolidate_annotation(path, gene_list, geneID_to_descriptions)
        ## majority geneName
        all_geneName, majority_geneName = consolidate_geneName(path, gene_list, geneID_to_descriptions)
        ## extract gain/loss event count
        gene_event = dt_geneEvents[gid]
        ## average length
        seqs = read_fasta(geneCluster_path+'%s%s' % (clusterID, '.fna')).values()
        geneClusterLength = int(np.mean([len(igene) for igene in seqs]))
        ## msa
        geneCluster_aln = clusterID

        ## check for duplicates
        if gene_count > strain_count:
            duplicated_state = 'yes'
            dup_list = [ig.split('|')[0] for ig in gene_list]
            # "#" to delimit (gene/gene_count) key/value; "@" to separate genes
            # Counter({'g1': 2, 'g2': 1})
            dup_detail = ''.join(['%s#%s@' % (kd, vd) for kd, vd in Counter(dup_list).iteritems() if vd > 1])[:-1]
        else:
            duplicated_state = 'no'
            dup_detail = ''

        ## locus_tag
        if raw_locus_tag:
            # make a string of all raw locus tags (second field of igl.split('|'))
            all_locus_tags = ' '.join([igl.split('|')[1] for igl in gene_list])
        else:
            # in addition to locus tag, keep strain name (but replace '|')
            all_locus_tags = ' '.join([igl.replace('|', '_') for igl in gene_list])

        ## optionally store locus tags to file, remove from geneCluster.json
        if store_locus_tag:
            locus_tag_outfile.write('%s\t%s\n' % (clusterID, all_locus_tags))
            all_locus_tags = ''

        ## default cluster json fields
        cluster_json_line = ['"geneId":'+str(gid+1),
                             '"geneLen":'+str(geneClusterLength),
                             '"count":'+str(strain_count),
                             '"dupli":"'+duplicated_state+'"',
                             '"dup_detail":"'+dup_detail+'"',
                             '"ann":"'+majority_annotation+'"',
                             '"msa":"'+geneCluster_aln+'"',
                             '"divers":"'+gene_diversity_Dt[clusterID]+'"',
                             '"event":"'+str(gene_event)+'"',
                             '"allAnn":"'+allAnn+'"',
                             '"GName":"'+majority_geneName+'"',
                             '"allGName":"'+all_geneName+'"',
                             '"locus":"'+all_locus_tags+'"']

        if optional_table_column:
            cluster_json_line.extend(optional_geneCluster_properties(gene_list, optional_table_column))
        if clusterID in branch_associations:
            cluster_json_line.extend(geneCluster_associations(branch_associations[clusterID], suffix='BA'))
        if clusterID in presence_absence_associations:
            cluster_json_line.extend(geneCluster_associations(presence_absence_associations[clusterID], suffix='PA'))

        ## write record
        cluster_json_line = ','.join(cluster_json_line)
        geneClusterJSON_outfile.write('{'+cluster_json_line+'}')

    # close files
    geneClusterJSON_outfile.write(']')
    geneClusterJSON_outfile.close()
    if store_locus_tag:
        locus_tag_outfile.close()

def extract_sequences(path, strain_list, folders_dict, gbk_present, enable_RNA_clustering):
    ''' go through all GenBank files and extract sequences and metadata for each one '''
    gbk_path = folders_dict['gbk_path']
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    RNA_path = folders_dict['RNA_path']

    geneID_to_geneSeqID_file = '%sgeneID_to_geneSeqID.cpk' % path
    geneID_to_description_file = '%sgeneID_to_description.cpk' % path
    RNAID_to_SeqID_file = '%sRNAID_to_SeqID.cpk' % path
    RNAID_to_description_file = '%sRNAID_to_description.cpk' % path
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    RNA_dict_path = '%s%s' % (RNA_path, 'all_RNA_seq.cpk')

    geneID_to_geneSeqID_dict = defaultdict()
    geneID_to_description_dict = defaultdict()
    RNAID_to_SeqID_dict = defaultdict()
    RNAID_to_description_dict = defaultdict()
    gene_aa_dict = defaultdict(dict)
    gene_na_dict = defaultdict(dict)
    RNA_dict = defaultdict(dict)

    if gbk_present:
        ## clean up folders when data from a previous run exist
        os.system('rm -rf '+protein_path+'*.faa')
        os.system('rm -rf '+nucleotide_path+'*.fna')
        missing_CDS_list = []  ## a list containing strains which have no CDS (if any)
        ## process gbk files
        for strainID in strain_list:
            gbk_fname = ''.join([gbk_path, strainID, '.gbk'])
            protein_fname = ''.join([protein_path, strainID, '.faa'])
            nucleotide_fname = ''.join([nucleotide_path, strainID, '.fna'])
            RNA_fname = ''.join([RNA_path, strainID, '.fna'])
            check_CDS_passed = gbk_translation(strainID, gbk_fname, protein_fname, nucleotide_fname, RNA_fname,
                                               geneID_to_geneSeqID_dict, geneID_to_description_dict,
                                               RNAID_to_SeqID_dict, RNAID_to_description_dict,
                                               gene_aa_dict, gene_na_dict, RNA_dict, enable_RNA_clustering)
            if not check_CDS_passed:
                missing_CDS_list.append(strainID)
        if len(missing_CDS_list) != 0:
            print 'Warning: no CDS found in the following genome/genomes, please double-check\n', missing_CDS_list
            exit()
    else:
        ## process fna/faa files if gbk files are not given
        for strainID in strain_list:
            ## amino acid and nucleotide sequences
            protein_fname = ''.join([protein_path, strainID, '.faa'])
            nucleotide_fname = ''.join([nucleotide_path, strainID, '.fna'])
            aa_sequence_dt = read_fasta(protein_fname)
            na_sequence_dt = read_fasta(nucleotide_fname)
            ## prepare geneSeqID and description
            for geneID in aa_sequence_dt.keys():
                geneName, annotation = '', ''
                geneID_to_geneSeqID_dict[geneID] = geneID
                geneID_to_description_dict[geneID] = {'geneName': geneName, 'annotation': annotation}
                gene_aa_dict[strainID][geneID] = aa_sequence_dt[geneID]
                gene_na_dict[strainID][geneID] = na_sequence_dt[geneID]

    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    write_pickle(protein_dict_path, gene_aa_dict)
    write_pickle(nucleotide_dict_path, gene_na_dict)

    ## option: process RNA sequences for RNA_clustering
    if enable_RNA_clustering:
        write_pickle(RNA_dict_path, RNA_dict)
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)

    return gene_aa_dict, gene_na_dict

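## --- Illustration (not part of panX): a typical call, assuming the folder layout was set up
## by the pipeline beforehand (all paths and the strain name below are hypothetical):
##   folders_dict = {'gbk_path': './data/input_GenBank/', 'protein_path': './data/protein_faa/',
##                   'nucleotide_path': './data/nucleotide_fna/', 'RNA_path': './data/RNA_fna/'}
##   gene_aa_dict, gene_na_dict = extract_sequences('./data/', ['NC_018495'], folders_dict,
##                                                  gbk_present=True, enable_RNA_clustering=False)
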