def build_representative_cluster(clustering_path, threads, input_prefix):
    """Pick one representative sequence per cluster of a sub-problem.

    Reads ``<clustering_path><input_prefix>_cluster.output`` (one cluster per
    line, tab-separated gene IDs), appends the sequence of the first gene of
    every cluster to ``<clustering_path><input_prefix>_representative.faa``
    under a newly generated cluster ID, and pickles the mapping
    ``{clusterID: [gene IDs]}`` to ``<clustering_path><input_prefix>_dicts.cpk``.

    Parameters:
        clustering_path: path prefix; it is concatenated as a plain string, so
            it has to end with a path separator.
        threads: not used by this function; kept so callers need not change.
        input_prefix: file prefix; the text after ``'subproblem_'`` becomes
            part of every generated cluster ID.

    Raises:
        IndexError: if ``input_prefix`` does not contain ``'subproblem_'``.
        KeyError: if the first gene of a cluster is missing from the FASTA file.
    """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)

    with open(cluster_file, 'rb') as cluster_input:
        cluster_input_lines = list(cluster_input)

    subproblem_geneCluster_dt = {}
    subproblem_run_number = input_prefix.split('subproblem_')[1]
    # Open the output once instead of once per cluster. The guard keeps the
    # previous behaviour of not creating the file when there are no clusters.
    if cluster_input_lines:
        with open(representative_outputfile, 'a') as representative_output:
            for gid, iline in enumerate(cluster_input_lines):
                ## use time to avoid clusterID conflict
                clusterID = "GCs%s_%07d%s" % (
                    subproblem_run_number, gid,
                    time.strftime('%M%S', time.gmtime()))
                gene_ids = iline.rstrip().split('\t')
                subproblem_geneCluster_dt[clusterID] = gene_ids
                ## the first member of the cluster is its representative
                representative_seq = subproblem_faa_dict[gene_ids[0]]
                write_in_fa(representative_output, clusterID,
                            representative_seq)

    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                 subproblem_geneCluster_dt)
    # Same text as the former multi-item print statement.
    print('build representative clusters for %s :  %s \n'
          % (input_prefix, times(start)))
def create_RNACluster_fa(path, folders_dict):
    """Write one nucleotide FASTA file per RNA cluster.

    Input (pickles): ``<RNA_path>all_RNA_seq.cpk``, ``<RNA_path>allclusters.cpk``
    and ``<path>RNAID_to_SeqID.cpk``.
    Output: ``<path>geneCluster/<clusterID>.fna`` for every cluster.

    Parameters:
        path: path prefix (concatenated as a string).
        folders_dict: mapping that provides ``'RNA_path'``.

    Returns:
        The cluster dictionary loaded from ``allclusters.cpk``.
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))
    ## load RNA cluster cpk file
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')
    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')
    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        # 'with' closes the file even when a member lookup below raises.
        with open(fasta_path + RNA_cluster_nu_filename,
                  'wb') as RNA_cluster_nu_write:
            for RNA_memb in RNA[1]:
                ## RNA_name format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    """Align each RNA cluster, build its tree and record nucleotide diversity.

    For a cluster with a single sequence nothing is aligned: the sequence is
    copied to ``<cluster>_na.aln`` (with ``'|'`` in the IDs replaced by ``'-'``)
    and a diversity of ``0.0`` is recorded. Otherwise an ``mpm_tree`` is
    aligned, built, optionally refined and exported.

    One line ``clusterID<TAB>diversity`` is appended to
    ``<alignFile_path>gene_diversity.txt`` per processed cluster. A failure in
    one cluster is reported on stdout and processing goes on with the next one.

    Parameters:
        fa_files_list: paths of the cluster ``.fna`` files.
        alignFile_path: output path prefix (concatenated as a string).
        simple_tree: when equal to ``False`` the tree is built with
            ``treetime_used=True`` and ancestral/refine steps are run.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_fname = alignFile_path + 'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        # 'except Exception' (not a bare except) so that Ctrl-C / SystemExit
        # still stop the run instead of being reported as a failed cluster.
        try:
            # cluster ID is the file name up to the first dot
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            cluster_seqs = read_fasta(RNA_cluster_nu_filename)
            is_singleton = len(cluster_seqs) == 1
            if is_singleton:  # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace(
                    '.fna', '_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in cluster_seqs.iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'),
                                    Sequence)
                RNA_diversity_values = '0.0'
            else:  # align and build tree
                print(RNA_cluster_nu_filename)
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()
                # '== False' kept on purpose: None must take the else branch.
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name,
                                 treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine(CDS=False)
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name,
                                 treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)
                RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
            # Open only for the write so the handle is always closed.
            with open(diversity_fname, 'a') as geneDiversity_file:
                geneDiversity_file.write(
                    '%s\t%s\n' % (clusterID, RNA_diversity_values))
            if not is_singleton:
                print('%s %s' % (clusterID, RNA_diversity_values))
        except Exception:
            print("Aligning and tree building of RNA %s failed"
                  % RNA_cluster_nu_filename)
def create_geneCluster_fa(path, folders_dict):
    """Write nucleotide and amino-acid FASTA files for every gene cluster.

    Input (pickles): ``<clustering_path>allclusters.cpk``,
    ``<path>geneID_to_geneSeqID.cpk``, ``<protein_path>all_protein_seq.cpk``
    and ``<nucleotide_path>all_nucleotide_seq.cpk``.
    Output: ``<path>geneCluster/<clusterID>.fna`` and ``.faa`` per cluster.
    The ``geneCluster`` folder is removed and recreated first.

    Parameters:
        path: path prefix (concatenated as a string).
        folders_dict: mapping that provides ``'clustering_path'``,
            ``'protein_path'`` and ``'nucleotide_path'``.
    """
    import shutil  # local import: only this function needs it

    cluster_seqs_path = path + 'geneCluster/'
    ## make sure the geneCluster folder is empty
    # rmtree/mkdir replace the shell strings 'rm -rf ...' / 'mkdir ...', which
    # broke on paths containing spaces or shell metacharacters. Failures to
    # delete are ignored, as they were with os.system.
    shutil.rmtree(cluster_seqs_path, ignore_errors=True)

    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')
    gene_aa_dict = load_pickle('%s%s' % (protein_path, 'all_protein_seq.cpk'))
    gene_na_dict = load_pickle(
        '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk'))

    ## create cluster-genes fasta files
    if not os.path.isdir(cluster_seqs_path):
        os.mkdir(cluster_seqs_path)

    ## write nuc/aa sequences for each cluster
    for clusterID, gene in geneCluster_dt.iteritems():
        gene_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        gene_cluster_aa_filename = "%s%s" % (clusterID, '.faa')
        with open(cluster_seqs_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
                open(cluster_seqs_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name = gene_memb.split('|')[0]
                geneSeqID = geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID,
                            gene_na_dict[strain_name][gene_memb])
                write_in_fa(gene_cluster_aa_write, geneSeqID,
                            gene_aa_dict[strain_name][gene_memb])
def create_RNACluster_fa(path, folders_dict):
    """Write one nucleotide FASTA file per RNA cluster.

    Input (pickles): ``<RNA_path>all_RNA_seq.cpk``, ``<RNA_path>allclusters.cpk``
    and ``<path>RNAID_to_SeqID.cpk``.
    Output: ``<path>geneCluster/<clusterID>.fna`` for every cluster.

    Parameters:
        path: path prefix (concatenated as a string).
        folders_dict: mapping that provides ``'RNA_path'``.

    Returns:
        The cluster dictionary loaded from ``allclusters.cpk``.
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))
    ## load RNA cluster cpk file
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')
    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')
    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        # Context manager: the handle is released even if a lookup fails.
        with open(fasta_path + RNA_cluster_nu_filename,
                  'wb') as RNA_cluster_nu_write:
            for RNA_memb in RNA[1]:
                ## RNA_name format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
    return diamond_RNACluster_dt
def build_representative_cluster(clustering_path, threads, input_prefix):
    """Pick one representative sequence per cluster of a sub-problem.

    Reads ``<clustering_path><input_prefix>_cluster.output`` (one cluster per
    line, tab-separated gene IDs), appends the sequence of the first gene of
    every cluster to ``<clustering_path><input_prefix>_representative.faa``
    under a newly generated cluster ID, and pickles the mapping
    ``{clusterID: [gene IDs]}`` to ``<clustering_path><input_prefix>_dicts.cpk``.

    Parameters:
        clustering_path: path prefix; it is concatenated as a plain string, so
            it has to end with a path separator.
        threads: not used by this function; kept so callers need not change.
        input_prefix: file prefix; the text after ``'subproblem_'`` becomes
            part of every generated cluster ID.

    Raises:
        IndexError: if ``input_prefix`` does not contain ``'subproblem_'``.
        KeyError: if the first gene of a cluster is missing from the FASTA file.
    """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)

    with open(cluster_file, 'rb') as cluster_input:
        cluster_input_lines = list(cluster_input)

    subproblem_geneCluster_dt = {}
    subproblem_run_number = input_prefix.split('subproblem_')[1]
    # The output used to be reopened in append mode for every cluster; open
    # it once. Skipped entirely for an empty cluster list so that no empty
    # file appears where none was created before.
    if cluster_input_lines:
        with open(representative_outputfile, 'a') as representative_output:
            for gid, iline in enumerate(cluster_input_lines):
                ## use time to avoid clusterID conflict
                clusterID = "GCs%s_%07d%s" % (
                    subproblem_run_number, gid,
                    time.strftime('%M%S', time.gmtime()))
                gene_ids = iline.rstrip().split('\t')
                subproblem_geneCluster_dt[clusterID] = gene_ids
                ## the first member of the cluster is its representative
                representative_seq = subproblem_faa_dict[gene_ids[0]]
                write_in_fa(representative_output, clusterID,
                            representative_seq)

    ## write subproblem_geneCluster_dt
    write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                 subproblem_geneCluster_dt)
    # Same text as the former multi-item print statement.
    print('build representative clusters for %s :  %s \n'
          % (input_prefix, times(start)))
def align_and_makeTree(fna_file_list, alignFile_path, simple_tree):
    """Align each gene cluster, build its tree and record nucleotide diversity.

    For a cluster with a single sequence the nucleotide and amino-acid
    sequences are written unchanged (IDs with ``'|'`` replaced by ``'-'``) to
    ``<cluster>_na_aln.fa`` / ``<cluster>_aa_aln.fa``, each is copied to the
    matching ``_aln_reduced`` name, and a diversity of ``0.0`` is recorded.
    Otherwise an ``mpm_tree`` is codon-aligned, translated, built, optionally
    refined and exported.

    One line ``clusterID<TAB>diversity`` is appended to
    ``<alignFile_path>gene_diversity.txt`` per processed cluster. A failure in
    one cluster is reported on stdout and processing goes on with the next one.

    Parameters:
        fna_file_list: paths of the cluster ``.fna`` files; the ``.faa`` file
            of a cluster is expected next to it.
        alignFile_path: output path prefix (concatenated as a string).
        simple_tree: when equal to ``False`` the tree is built with
            ``treetime_used=True`` and ancestral/refine steps are run.
    """
    import subprocess  # local import: only this function needs it

    def _write_unaligned(sequences, aln_filename):
        """Write `sequences` as a pseudo-alignment and make its '_reduced' copy."""
        ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
        with open(aln_filename, 'wb') as write_file:
            for SeqID, Sequence in sequences.iteritems():
                write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
        # Argument list instead of a shell string: safe for paths containing
        # spaces or quotes. A failing cp is still not fatal.
        subprocess.call(['cp', aln_filename,
                         aln_filename.replace('_aln', '_aln_reduced')])

    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_fname = alignFile_path + 'gene_diversity.txt'
    for gene_cluster_nu_filename in fna_file_list:
        # 'except Exception' (not a bare except) so that Ctrl-C / SystemExit
        # still stop the run instead of being reported as a failed cluster.
        try:
            # cluster ID is the file name up to the first dot
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            nu_seqs = read_fasta(gene_cluster_nu_filename)
            if len(nu_seqs) == 1:  # nothing to do for singletons
                ## na_aln.fa
                _write_unaligned(
                    nu_seqs,
                    gene_cluster_nu_filename.replace('.fna', '_na_aln.fa'))
                ## aa_aln.fa
                gene_cluster_aa_filename = gene_cluster_nu_filename.replace(
                    '.fna', '.faa')
                _write_unaligned(
                    read_fasta(gene_cluster_aa_filename),
                    gene_cluster_nu_filename.replace('.fna', '_aa_aln.fa'))
                diversity_nuc = '0.0'
            else:  # align and build tree
                myTree = mpm_tree(gene_cluster_nu_filename)
                myTree.codon_align()
                myTree.translate()
                # '== False' kept on purpose: None must take the else branch.
                if simple_tree == False:
                    myTree.build(raxml=False, fasttree_program=fasttree_name,
                                 treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False, fasttree_program=fasttree_name,
                                 treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path)
                diversity_nuc = round(myTree.diversity_nuc, 3)
            # Open only for the write so the handle is always closed.
            with open(diversity_fname, 'a') as geneDiversity_file:
                geneDiversity_file.write(
                    '%s\t%s\n' % (clusterID, diversity_nuc))
        except Exception:
            print("Aligning and tree building of %s failed"
                  % gene_cluster_nu_filename)
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    """Build the per-strain gene presence/absence pseudo-alignment.

    Every sorted gene cluster contributes one character to the string of each
    strain (via ``create_genePresence``). The strings are written to
    ``geneCluster/genePresence.aln`` and pickled to
    ``geneCluster/dt_genePresence.cpk``.

    When ``disable_gain_loss`` is true, an all-zero ``dt_geneEvents.cpk`` is
    written and the presence patterns are exported as JSON: one merged
    ``geneGainLossEvent.json`` if ``merged_gain_loss_output`` is true,
    otherwise one ``<clusterID>_patterns.json`` per cluster in which every
    ``'1'`` of the presence string is rewritten to ``'3'``.
    """
    out_dir = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strains = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    strain_set = set(strains)
    n_strains = len(strain_set)
    presence_by_strain = defaultdict(str)
    clusters = load_sorted_clusters(path)
    ## clusters: [(clusterID, [count_strains, [memb1,...], count_genes]), ...]
    for _cluster_name, cluster_info in clusters:
        ## append 0/1 to each strain
        create_genePresence(presence_by_strain, n_strains, strain_set,
                            cluster_info[1])

    with open('%s%s' % (out_dir, 'genePresence.aln'), 'wb') as aln_out:
        for strain_id in presence_by_strain:
            write_in_fa(aln_out, strain_id, presence_by_strain[strain_id])
    write_pickle('%s%s' % (out_dir, 'dt_genePresence.cpk'), presence_by_strain)

    if not disable_gain_loss:
        return

    write_pickle('%s%s' % (out_dir, 'dt_geneEvents.cpk'),
                 dict.fromkeys(range(len(clusters)), 0))

    if merged_gain_loss_output:
        write_json(presence_by_strain,
                   '%s%s' % (out_dir, 'geneGainLossEvent.json'), indent=1)
        return

    ## strainID as key, presence pattern as value (converted into np.array)
    ordered_strains = sorted(presence_by_strain)
    char_rows = [np.array(presence_by_strain[name], 'c')
                 for name in ordered_strains]
    presence_arr = np.array(char_rows)
    presence_arr[presence_arr == '1'] = '3'
    for col, (cluster_name, _cluster_info) in enumerate(clusters):
        pattern_dt = {}
        for row, flag in enumerate(presence_arr[:, col]):
            pattern_dt[ordered_strains[row]] = str(flag)
        write_json(pattern_dt,
                   '%s%s_patterns.json' % (out_dir, cluster_name), indent=1)
def export_cluster_seq_tmp(cluster_seqs_path, geneCluster_dt, geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict):
    """Write the nucleotide sequences of every cluster to '<clusterID>.fna'.

    Only nucleotide files are produced; ``gene_aa_dict`` is accepted but not
    used. Cluster members are looked up as ``gene_na_dict[strain][geneID]``
    and written under the ID given by ``geneID_to_geneSeqID_dict``.
    """
    for cluster_name, cluster_info in geneCluster_dt.iteritems():
        fna_path = cluster_seqs_path + "%s%s" % (cluster_name, '.fna')
        with open(fna_path, 'wb') as fna_handle:
            for member_id in cluster_info[1]:
                ## member format: strain_1|locusTag
                strain = member_id.split('|')[0]
                seq_id = geneID_to_geneSeqID_dict[member_id]
                write_in_fa(fna_handle, seq_id,
                            gene_na_dict[strain][member_id])
def concatenate_core_gene_alignments(input_path, output_path):
    """Concatenate all core-gene alignments into one FASTA record per accession.

    Reads the gene names listed in
    ``<input_path>/geneCluster/core_geneList.txt``, opens
    ``<input_path>/vis/geneCluster/<name>.gz`` for each, and appends every
    record's sequence to the entry of its accession (the part of the record ID
    before the first ``'-'``). The result is written to ``output_path``.

    Blank lines in the gene list are skipped.
    """
    # Collect the pieces per accession and join once at the end; repeated
    # string concatenation was quadratic in the alignment length.
    core_gene_parts = defaultdict(list)
    with open(input_path + '/geneCluster/core_geneList.txt') as core_list:
        # all core gene alignments in FASTA files
        for gene in core_list:
            gene_name = gene.rstrip()
            if not gene_name:
                continue
            gene_path = input_path + '/vis/geneCluster/' + gene_name + '.gz'
            with gzip.open(gene_path, 'rb') as zip_file:
                for record in SeqIO.parse(zip_file, "fasta"):
                    # e.g. NC_018495-CM9_RS00390-1-hypothetical_protein
                    accession = record.id.split('-')[0]
                    core_gene_parts[accession].append(str(record.seq))

    with open(output_path, 'wb') as output_file:
        for gene_id, seq_parts in core_gene_parts.iteritems():
            write_in_fa(output_file, gene_id, ''.join(seq_parts))
def export_cluster_seq_tmp(cluster_seqs_path, geneCluster_dt, geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict):
    """Write the nucleotide sequences of every cluster to '<clusterID>.fna'.

    Only nucleotide files are produced; ``gene_aa_dict`` is accepted but not
    used. Cluster members are looked up as ``gene_na_dict[strain][geneID]``
    and written under the ID given by ``geneID_to_geneSeqID_dict``.
    """
    for cluster_name, cluster_info in geneCluster_dt.iteritems():
        fna_path = cluster_seqs_path + "%s%s" % (cluster_name, '.fna')
        with open(fna_path, 'wb') as fna_handle:
            for member_id in cluster_info[1]:
                ## member format: strain_1|locusTag
                strain = member_id.split('|')[0]
                seq_id = geneID_to_geneSeqID_dict[member_id]
                write_in_fa(fna_handle, seq_id,
                            gene_na_dict[strain][member_id])
def concatenate_core_gene_alignments(input_path, output_path):
    """Concatenate all core-gene alignments into one FASTA record per accession.

    Reads the gene names listed in
    ``<input_path>/geneCluster/core_geneList.txt``, opens
    ``<input_path>/vis/geneCluster/<name>.gz`` for each, and appends every
    record's sequence to the entry of its accession (the part of the record ID
    before the first ``'-'``). The result is written to ``output_path``.

    Blank lines in the gene list are skipped.
    """
    # Collect the pieces per accession and join once at the end; repeated
    # string concatenation was quadratic in the alignment length.
    core_gene_parts = defaultdict(list)
    with open(input_path + '/geneCluster/core_geneList.txt') as core_list:
        # all core gene alignments in FASTA files
        for gene in core_list:
            gene_name = gene.rstrip()
            if not gene_name:
                continue
            gene_path = input_path + '/vis/geneCluster/' + gene_name + '.gz'
            with gzip.open(gene_path, 'rb') as zip_file:
                for record in SeqIO.parse(zip_file, "fasta"):
                    # e.g. NC_018495-CM9_RS00390-1-hypothetical_protein
                    accession = record.id.split('-')[0]
                    core_gene_parts[accession].append(str(record.seq))

    with open(output_path, 'wb') as output_file:
        for gene_id, seq_parts in core_gene_parts.iteritems():
            write_in_fa(output_file, gene_id, ''.join(seq_parts))
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    """Align each RNA cluster, build its tree and record nucleotide diversity.

    For a cluster with a single sequence nothing is aligned: the sequence is
    copied to ``<cluster>_na.aln`` (with ``'|'`` in the IDs replaced by ``'-'``)
    and a diversity of ``0.0`` is recorded. Otherwise an ``mpm_tree`` is
    aligned, built, optionally refined and exported.

    One line ``clusterID<TAB>diversity`` is appended to
    ``<alignFile_path>gene_diversity.txt`` per processed cluster. Errors are
    not caught here: the first failing cluster aborts the loop.

    Parameters:
        fa_files_list: paths of the cluster ``.fna`` files.
        alignFile_path: output path prefix (concatenated as a string).
        simple_tree: when equal to ``False`` the tree is built with
            ``treetime_used=True`` and ancestral/refine steps are run.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_fname = alignFile_path + 'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        # cluster ID is the file name up to the first dot
        clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
        cluster_seqs = read_fasta(RNA_cluster_nu_filename)
        is_singleton = len(cluster_seqs) == 1
        if is_singleton:  # nothing to do for singletons
            ## na.aln
            RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace(
                '.fna', '_na.aln')
            ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
            with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                for SeqID, Sequence in cluster_seqs.iteritems():
                    write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
            RNA_diversity_values = '0.0'
        else:  # align and build tree
            print(RNA_cluster_nu_filename)
            myTree = mpm_tree(RNA_cluster_nu_filename)
            myTree.align()
            # '== False' kept on purpose: None must take the else branch.
            if simple_tree == False:
                myTree.build(raxml=False, fasttree_program=fasttree_name,
                             treetime_used=True)
                myTree.ancestral(translate_tree=True)
                # NOTE(review): refine() is called with its defaults here;
                # confirm they are appropriate for non-coding RNA sequences.
                myTree.refine()
            else:
                myTree.build(raxml=False, fasttree_program=fasttree_name,
                             treetime_used=False)
            myTree.diversity_statistics_nuc()
            myTree.export(path=alignFile_path, RNA_specific=True)
            RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
        # Open only for the write; the handle used to be left unclosed.
        with open(diversity_fname, 'a') as geneDiversity_file:
            geneDiversity_file.write(
                '%s\t%s\n' % (clusterID, RNA_diversity_values))
        if not is_singleton:
            print('%s %s' % (clusterID, RNA_diversity_values))
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    """Build the per-strain gene presence/absence pseudo-alignment.

    Every sorted gene cluster contributes one character to the string of each
    strain (via ``create_genePresence``). The strings are written to
    ``geneCluster/genePresence.aln`` and pickled to
    ``geneCluster/dt_genePresence.cpk``.

    When ``disable_gain_loss`` is true, an all-zero ``dt_geneEvents.cpk`` is
    written and the presence patterns are exported as JSON: one merged
    ``geneGainLossEvent.json`` if ``merged_gain_loss_output`` is true,
    otherwise one ``<clusterID>_patterns.json`` per cluster in which every
    ``'1'`` of the presence string is rewritten to ``'3'``.
    """
    out_dir = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strains = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    strain_set = set(strains)
    n_strains = len(strain_set)
    presence_by_strain = defaultdict(str)
    clusters = load_sorted_clusters(path)
    ## clusters: [(clusterID, [count_strains, [memb1,...], count_genes]), ...]
    for _cluster_name, cluster_info in clusters:
        ## append 0/1 to each strain
        create_genePresence(presence_by_strain, n_strains, strain_set,
                            cluster_info[1])

    with open('%s%s' % (out_dir, 'genePresence.aln'), 'wb') as aln_out:
        for strain_id in presence_by_strain:
            write_in_fa(aln_out, strain_id, presence_by_strain[strain_id])
    write_pickle('%s%s' % (out_dir, 'dt_genePresence.cpk'), presence_by_strain)

    if not disable_gain_loss:
        return

    write_pickle('%s%s' % (out_dir, 'dt_geneEvents.cpk'),
                 dict.fromkeys(range(len(clusters)), 0))

    if merged_gain_loss_output:
        write_json(presence_by_strain,
                   '%s%s' % (out_dir, 'geneGainLossEvent.json'), indent=1)
        return

    ## strainID as key, presence pattern as value (converted into np.array)
    ordered_strains = sorted(presence_by_strain)
    char_rows = [np.array(presence_by_strain[name], 'c')
                 for name in ordered_strains]
    presence_arr = np.array(char_rows)
    presence_arr[presence_arr == '1'] = '3'
    for col, (cluster_name, _cluster_info) in enumerate(clusters):
        pattern_dt = {}
        for row, flag in enumerate(presence_arr[:, col]):
            pattern_dt[ordered_strains[row]] = str(flag)
        write_json(pattern_dt,
                   '%s%s_patterns.json' % (out_dir, cluster_name), indent=1)
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """Create the SNP matrix from core-gene alignments.

    input:  strain_list.cpk, sorted clusters, geneCluster/<cluster>_na_aln.fa
    output (all in ``<path>geneCluster/``):
        core_geneList.txt / core_geneList.cpk -- names of the core alignments
        snp_pos.cpk          -- {alignment name: array of SNP column indices}
        SNP_whole_matrix.aln -- one row of concatenated SNP columns per strain

    Parameters:
        path: path prefix (concatenated as a string).
        core_cutoff: fraction of strains that must carry a gene for it to be
            core. 1.0 (default) is strict core; e.g. 0.9 is a soft core.
        core_gene_strain_fpath: optional file with one strain name per line
            ('-' is replaced by '_'); a gene is only accepted as core if it is
            present in all of these strains.

    A column is a SNP when the strains that carry the gene differ in it and
    none of them has a gap there. In soft-core mode strains without the gene
    are excluded from this test and emitted as '-' in the matrix.
    """
    import os
    import numpy as np
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)

    core_strain_set = None
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                i.rstrip().replace('-', '_') for i in core_gene_strain_file)

    # Loop-invariant: minimum number of strains for a gene to count as core.
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    corelist = []
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            # single-copy (strain count == gene count) and frequent enough
            if vg[0] != vg[2] or vg[0] < strain_core_cutoff:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
            coreGeneName_path = alnFilePath + coreGeneName
            ## sequences might be discarded because of premature stops
            if not os.path.exists(coreGeneName_path):
                continue
            if len(read_fasta(coreGeneName_path)) < strain_core_cutoff:
                continue
            if core_strain_set is not None and \
                    core_strain_set - set(i.split('|')[0] for i in vg[1]):
                continue
            outfile.write(coreGeneName + '\n')
            corelist.append(coreGeneName)
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = sorted(strain_list)
    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # SNP columns of each core gene, joined once at the end
    for align_file in corelist:  ## core genes
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        # sequence IDs look like strain-locus_tag-...; key by strain
        strain_seq_dt = {}
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq

        if core_cutoff == 1.0:
            rows = [np.fromstring(strain_seq_dt[strain], dtype='S1')
                    for strain in sorted(strain_seq_dt)]
            nuc_array = np.vstack(rows)
            present = nuc_array
        else:
            # placeholder row (spaces) for strains that lack the gene
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            rows = []
            for strain in refSeqList:
                if strain in strain_seq_dt:
                    rows.append(
                        np.fromstring(strain_seq_dt[strain], dtype='S1'))
                else:
                    print('Soft core gene: gene absent in strain %s on cluster %s'
                          % (strain, align_file))
                    rows.append(np.fromstring(missing_gene_seq, dtype='S1'))
            nuc_array = np.vstack(rows)
            is_missing = np.all(nuc_array == ' ', axis=1)
            # Compare only strains that carry the gene. Comparing against row 0
            # of a masked array lost every SNP of the gene whenever the first
            # strain was one of the missing ones.
            present = nuc_array[~is_missing]
            ## missing genes appear as gaps in the output matrix
            nuc_array[is_missing] = '-'

        ## find SNP positions
        if present.shape[0]:
            position_polymorphic = np.any(present != present[0, :], axis=0)
            position_has_gap = np.any(present == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
        else:
            position_SNP = np.zeros(nuc_array.shape[1], dtype=bool)
        snp_pos_dt[align_file] = np.where(position_SNP)[0]
        snp_blocks.append(nuc_array[:, position_SNP])

    snp_whole_matrix = np.hstack(snp_blocks) if snp_blocks else np.array([])
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):
    """Create the SNP matrix from core-gene alignments.

    input:  strain_list.cpk, sorted clusters, geneCluster/<cluster>_na_aln.fa
    output (all in ``<path>geneCluster/``):
        core_geneList.txt / core_geneList.cpk -- names of the core alignments
        snp_pos.cpk          -- {alignment name: array of SNP column indices}
        SNP_whole_matrix.aln -- one row of concatenated SNP columns per strain

    Parameters:
        path: path prefix (concatenated as a string).
        core_cutoff: fraction of strains that must carry a gene for it to be
            core. 1.0 (default) is strict core; e.g. 0.9 is a soft core.
        core_gene_strain_fpath: optional file with one strain name per line
            ('-' is replaced by '_'); a gene is only accepted as core if it is
            present in all of these strains.

    A column is a SNP when the strains that carry the gene differ in it and
    none of them has a gap there. In soft-core mode strains without the gene
    are excluded from this test and emitted as '-' in the matrix.
    """
    import os
    import numpy as np
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)

    core_strain_set = None
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                i.rstrip().replace('-', '_') for i in core_gene_strain_file)

    # Loop-invariant: minimum number of strains for a gene to count as core.
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    corelist = []
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            # single-copy (strain count == gene count) and frequent enough
            if vg[0] != vg[2] or vg[0] < strain_core_cutoff:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
            coreGeneName_path = alnFilePath + coreGeneName
            ## sequences might be discarded because of premature stops
            if not os.path.exists(coreGeneName_path):
                continue
            if len(read_fasta(coreGeneName_path)) < strain_core_cutoff:
                continue
            if core_strain_set is not None and \
                    core_strain_set - set(i.split('|')[0] for i in vg[1]):
                continue
            outfile.write(coreGeneName + '\n')
            corelist.append(coreGeneName)
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = sorted(strain_list)
    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # SNP columns of each core gene, joined once at the end
    for align_file in corelist:  ## core genes
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        # sequence IDs look like strain-locus_tag-...; key by strain
        strain_seq_dt = {}
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq

        if core_cutoff == 1.0:
            rows = [np.fromstring(strain_seq_dt[strain], dtype='S1')
                    for strain in sorted(strain_seq_dt)]
            nuc_array = np.vstack(rows)
            present = nuc_array
        else:
            # placeholder row (spaces) for strains that lack the gene
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            rows = []
            for strain in refSeqList:
                if strain in strain_seq_dt:
                    rows.append(
                        np.fromstring(strain_seq_dt[strain], dtype='S1'))
                else:
                    print('Soft core gene: gene absent in strain %s on cluster %s'
                          % (strain, align_file))
                    rows.append(np.fromstring(missing_gene_seq, dtype='S1'))
            nuc_array = np.vstack(rows)
            is_missing = np.all(nuc_array == ' ', axis=1)
            # Compare only strains that carry the gene. Comparing against row 0
            # of a masked array lost every SNP of the gene whenever the first
            # strain was one of the missing ones.
            present = nuc_array[~is_missing]
            ## missing genes appear as gaps in the output matrix
            nuc_array[is_missing] = '-'

        ## find SNP positions
        if present.shape[0]:
            position_polymorphic = np.any(present != present[0, :], axis=0)
            position_has_gap = np.any(present == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
        else:
            position_SNP = np.zeros(nuc_array.shape[1], dtype=bool)
        snp_pos_dt[align_file] = np.where(position_SNP)[0]
        snp_blocks.append(nuc_array[:, position_SNP])

    snp_whole_matrix = np.hstack(snp_blocks) if snp_blocks else np.array([])
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2, geneCluster_dt):
    """Replace a paralog-containing cluster by two new clusters.

    The old cluster is removed from ``geneCluster_dt``, logged in
    ``old_clusters_paralogSplit.txt`` and its files are moved to
    ``<file_path>paralog_splits/``. Two clusters ``<clusterID>_p1`` and
    ``<clusterID>_p2`` are then written (``.fna`` and ``.faa``) and added to
    ``geneCluster_dt`` as ``[num_strains, [gene members], num_genes]``.

    params:
        file_path: folder prefix of the cluster files (concatenated as string)
        fname: path of the tree (``.nwk``) file of the cluster to split
        gene_list1/2: iterables with the geneSeqIDs of the two new clusters
        geneCluster_dt: cluster dictionary, updated in place

    Returns:
        set with the paths of the two new ``.fna`` files.
    """
    import subprocess  # local import: only this function needs it

    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk', '')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk', 'fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk', 'faa')
    split_fa_files_set = set()
    ## load genes from old clusters
    origin_nu_fa_dt = read_fasta(file_path + origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_cluster_aa_fa)

    ## delete old (split) clusters -- best effort, a failure is only reported
    try:
        del geneCluster_dt[clusterID]
        with open(file_path + 'old_clusters_paralogSplit.txt',
                  'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n' % clusterID)
        suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
        if os.path.exists(fname):
            # tree files only exist when the tree itself exists
            suffix_list += ['.nwk', '_tree.json']
        tmp_files = [file_path + clusterID + suffix for suffix in suffix_list]
        # Argument list instead of a shell string: safe for paths containing
        # spaces or shell metacharacters.
        subprocess.call(['mv'] + tmp_files + [file_path + 'paralog_splits/'])
    except Exception:
        # formatted message; passing two arguments printed a tuple in Python 2
        print("paralog splitting: can't delete %s" % orgin_nwk_name)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(
            (list(gene_list1), list(gene_list2)), 1):
        newClusterId = "%s_p%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        with open(file_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
                open(file_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            split_fa_files_set.add(file_path + gene_cluster_nu_filename)
            ## write new split cluster files
            for gene_memb in split_gene_list:
                # tree node names carry escaped quotes (\'), FASTA IDs do not
                if "\\'" in gene_memb:
                    gene_memb = gene_memb.replace("\\'", "'")
                try:
                    write_in_fa(gene_cluster_nu_write, gene_memb,
                                origin_nu_fa_dt[gene_memb])
                    write_in_fa(gene_cluster_aa_write, gene_memb,
                                origin_aa_fa_dt[gene_memb])
                except Exception:
                    print('paralogy splitting (problem to write new split cluster files) %s'
                          % fname)

        geneCluster_dt[newClusterId] = [
            ## num_strains: distinct strain prefixes
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: part of the geneSeqID before the first '-'
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct geneSeqIDs
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
def output_cutted_clusters(file_path, uncluster_filename, gene_list,
                           cut_branch_threshold, treefile_used=None,
                           cut_leftover=None):
    """
    Write the new clusters obtained by cutting long branches of one cluster.

    For every gene list in gene_list a new pair of .fna/.faa files is written
    with sequences taken from the original cluster files.
    - cut_leftover set: the new cluster is named '<clusterID>_r<index>' and
      handed to cutTree_outputCluster for further cutting.
    - otherwise: the new cluster is named '<clusterID>_<index>', its
      statistics are pickled into 'update_long_branch_splits/' and its .fna
      path is appended to 'new_clusters_longSplit.txt'.

    NOTE(review): a second definition of this function appears later in this
    file and shadows this one once the module is loaded.

    params:
        file_path: path prefix of the cluster folder; file names are appended
            to it directly, so it has to end with a path separator
        uncluster_filename: nucleotide file name (.fna) of the original
            cluster
        gene_list: iterable of gene lists, one per new cluster; the entries
            are gene sequence IDs
        cut_branch_threshold, treefile_used: passed on unchanged to
            cutTree_outputCluster (only used when cut_leftover is set)
        cut_leftover: flag to indicate whether gene_list holds the leftover
            nodes after cutting long branches. Default: None (not set).
    """
    clusterID = uncluster_filename.replace('.fna', '')
    new_fa_files = set()
    ## '_r' identifies clusters built from the rest (leftover) genes
    cluster_id_template = "%s_r%s" if cut_leftover else "%s_%s"

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + uncluster_filename)
    origin_aa_fa_dt = read_fasta(file_path +
                                 uncluster_filename.replace('fna', 'faa'))

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        ## the members are traversed several times below
        split_gene_list = list(split_gene_list)
        newClusterId = cluster_id_template % (clusterID, sgs_index)

        ## write new divided/split cluster files
        gene_cluster_nu_filepath = file_path + newClusterId + '.fna'
        gene_cluster_aa_filepath = file_path + newClusterId + '.faa'
        ## 'with' closes both files even when a gene is missing (KeyError)
        with open(gene_cluster_nu_filepath, 'wb') as gene_cluster_nu_write, \
                open(gene_cluster_aa_filepath, 'wb') as gene_cluster_aa_write:
            for gene_memb in split_gene_list:
                ## Replace "\'" in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## NC_018495|CM9_RS01675-1-guanosine-3\',5\'-... in nwk node
                ## name; the escaped form would throw a KeyError below
                gene_memb = gene_memb.replace("\\'", "'")
                write_in_fa(gene_cluster_nu_write, gene_memb,
                            origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb,
                            origin_aa_fa_dt[gene_memb])

        if cut_leftover:
            ## align the rest genes, build tree, cut long branches till
            ## nothing can be cut.
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
            continue

        ## add record in new_clusters_longSplit.txt, which is used for
        ## align new clusters
        new_fa_files.add(gene_cluster_nu_filepath)

        ## write cluster statistics in folder update_long_branch_splits
        addin_geneCluster_dt = defaultdict(list)
        addin_geneCluster_dt[newClusterId] = [
            ## num_strains: distinct strain names (part before '|')
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: gene IDs (part before the first '-')
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct gene sequence IDs
            len(set(split_gene_list)),
        ]
        ## cPickle new cluster statistics
        write_pickle(
            ''.join([file_path, 'update_long_branch_splits/', newClusterId,
                     '.cpk']), addin_geneCluster_dt)

    ## write records of the new clusters (sorted for a reproducible file)
    with open(file_path + 'new_clusters_longSplit.txt',
              'a') as refined_cluster_file:
        for new_fa_file in sorted(new_fa_files):
            refined_cluster_file.write('%s\n' % new_fa_file)
def output_cutted_clusters(file_path, uncluster_filename, gene_list,
                           cut_branch_threshold, treefile_used=None,
                           cut_leftover=None):
    """
    Write the new clusters obtained by cutting long branches of one cluster.

    For every gene list in gene_list a new pair of .fna/.faa files is written
    with sequences taken from the original cluster files.
    - cut_leftover set: the new cluster is named '<clusterID>_r<index>' and
      handed to cutTree_outputCluster for further cutting.
    - otherwise: the new cluster is named '<clusterID>_<index>', its
      statistics are pickled into 'update_long_branch_splits/' and its .fna
      path is appended to 'new_clusters_longSplit.txt'.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition in effect after loading.

    params:
        file_path: path prefix of the cluster folder; file names are appended
            to it directly, so it has to end with a path separator
        uncluster_filename: nucleotide file name (.fna) of the original
            cluster
        gene_list: iterable of gene lists, one per new cluster; the entries
            are gene sequence IDs
        cut_branch_threshold, treefile_used: passed on unchanged to
            cutTree_outputCluster (only used when cut_leftover is set)
        cut_leftover: flag to indicate whether gene_list holds the leftover
            nodes after cutting long branches. Default: None (not set).
    """
    clusterID = uncluster_filename.replace('.fna', '')
    new_fa_files = set()
    ## '_r' identifies clusters built from the rest (leftover) genes
    cluster_id_template = "%s_r%s" if cut_leftover else "%s_%s"

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + uncluster_filename)
    origin_aa_fa_dt = read_fasta(file_path +
                                 uncluster_filename.replace('fna', 'faa'))

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        ## the members are traversed several times below
        split_gene_list = list(split_gene_list)
        newClusterId = cluster_id_template % (clusterID, sgs_index)

        ## write new divided/split cluster files
        gene_cluster_nu_filepath = file_path + newClusterId + '.fna'
        gene_cluster_aa_filepath = file_path + newClusterId + '.faa'
        ## 'with' closes both files even when a gene is missing (KeyError)
        with open(gene_cluster_nu_filepath, 'wb') as gene_cluster_nu_write, \
                open(gene_cluster_aa_filepath, 'wb') as gene_cluster_aa_write:
            for gene_memb in split_gene_list:
                ## Replace "\'" in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## NC_018495|CM9_RS01675-1-guanosine-3\',5\'-... in nwk node
                ## name; the escaped form would throw a KeyError below
                gene_memb = gene_memb.replace("\\'", "'")
                write_in_fa(gene_cluster_nu_write, gene_memb,
                            origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb,
                            origin_aa_fa_dt[gene_memb])

        if cut_leftover:
            ## align the rest genes, build tree, cut long branches till
            ## nothing can be cut.
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
            continue

        ## add record in new_clusters_longSplit.txt, which is used for
        ## align new clusters
        new_fa_files.add(gene_cluster_nu_filepath)

        ## write cluster statistics in folder update_long_branch_splits
        addin_geneCluster_dt = defaultdict(list)
        addin_geneCluster_dt[newClusterId] = [
            ## num_strains: distinct strain names (part before '|')
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: gene IDs (part before the first '-')
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct gene sequence IDs
            len(set(split_gene_list)),
        ]
        ## cPickle new cluster statistics
        write_pickle(
            ''.join([file_path, 'update_long_branch_splits/', newClusterId,
                     '.cpk']), addin_geneCluster_dt)

    ## write records of the new clusters (sorted for a reproducible file)
    with open(file_path + 'new_clusters_longSplit.txt',
              'a') as refined_cluster_file:
        for new_fa_file in sorted(new_fa_files):
            refined_cluster_file.write('%s\n' % new_fa_file)
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2,
                               geneCluster_dt):
    """
    Replace one gene cluster by the two clusters produced by a paralog split.

    The old cluster is removed from geneCluster_dt, its ID is appended to
    'old_clusters_paralogSplit.txt' and its files are moved into
    'paralog_splits/'. The two new clusters are named '<clusterID>_p1' and
    '<clusterID>_p2'; their sequences are written to .fna/.faa files and
    their statistics are added to geneCluster_dt.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition in effect after loading.

    params:
        file_path: path prefix of the cluster folder; file names are appended
            to it directly, so it has to end with a path separator
        fname: path of the tree file (.nwk) of the cluster being split
        gene_list1/2: iterables with the gene sequence IDs of the two new
            clusters
        geneCluster_dt: cluster dictionary, updated in place; every new entry
            is [number of strains, list of gene members, number of genes]
    return:
        set with the paths of the two new nucleotide (.fna) files
    """
    ## local import: only needed here, to move the files of the old cluster
    import shutil

    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk', '')
    split_fa_files_set = set()

    ## load genes from old clusters (before their files are moved away)
    origin_nu_fa_dt = read_fasta(file_path +
                                 orgin_nwk_name.replace('nwk', 'fna'))
    origin_aa_fa_dt = read_fasta(file_path +
                                 orgin_nwk_name.replace('nwk', 'faa'))

    ## delete old (split) clusters
    try:
        del geneCluster_dt[clusterID]
        with open(file_path + 'old_clusters_paralogSplit.txt',
                  'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n' % clusterID)
        suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
        if os.path.exists(fname):
            ## tree files are only moved when the tree file itself exists
            suffix_list += ['.nwk', '_tree.json']
        archive_path = file_path + 'paralog_splits/'
        for suffix in suffix_list:
            old_file = file_path + clusterID + suffix
            ## files are moved one by one without a shell, so unusual
            ## characters in the path cannot break the command;
            ## missing files are skipped
            if os.path.exists(old_file):
                shutil.move(old_file, archive_path + clusterID + suffix)
    except (KeyError, EnvironmentError):
        ## KeyError: cluster not in geneCluster_dt;
        ## EnvironmentError: record file or move failed
        print("paralog splitting: can't delete %s" % orgin_nwk_name)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(
            (list(gene_list1), list(gene_list2)), 1):
        newClusterId = "%s_p%s" % (clusterID, sgs_index)
        gene_cluster_nu_filepath = file_path + newClusterId + '.fna'
        gene_cluster_aa_filepath = file_path + newClusterId + '.faa'
        split_fa_files_set.add(gene_cluster_nu_filepath)

        ## 'with' closes both files even if writing fails half-way
        with open(gene_cluster_nu_filepath, 'wb') as gene_cluster_nu_write, \
                open(gene_cluster_aa_filepath, 'wb') as gene_cluster_aa_write:
            for gene_memb in split_gene_list:
                ## tree node names escape "'" as "\'", fasta IDs do not
                gene_memb = gene_memb.replace("\\'", "'")
                try:
                    write_in_fa(gene_cluster_nu_write, gene_memb,
                                origin_nu_fa_dt[gene_memb])
                    write_in_fa(gene_cluster_aa_write, gene_memb,
                                origin_aa_fa_dt[gene_memb])
                except (KeyError, EnvironmentError):
                    ## best effort: report the cluster and go on with the
                    ## next gene
                    print('paralogy splitting (problem to write new split '
                          'cluster files) %s' % fname)

        geneCluster_dt[newClusterId] = [
            ## num_strains: distinct strain names (part before '|')
            len(set(ig.split('|')[0] for ig in split_gene_list)),
            ## gene members: gene IDs (part before the first '-')
            [ig.split('-')[0] for ig in split_gene_list],
            ## num_genes: distinct gene sequence IDs
            len(set(split_gene_list)),
        ]
    return split_fa_files_set
def _strip_locus_tag_prefixes(locus_tag, strainID):
    """ remove 'PROKKA_' and the leading '<strainID>_' part from a locus_tag """
    locus_tag = locus_tag.replace('PROKKA_', '')
    strain_prefix = '%s_' % strainID
    if strain_prefix in locus_tag:
        ## keep what follows the first '<strainID>_' (up to the next one)
        locus_tag = locus_tag.split(strain_prefix)[1]
    return locus_tag


def gbk_translation(strainID, gbk_fname, protein_fname, nucleotide_fname,
                    RNA_fname, geneID_to_geneSeqID_dict,
                    geneID_to_description_dict, RNAID_to_SeqID_dict,
                    RNAID_to_description_dict, gene_aa_dict, gene_na_dict,
                    RNA_dict, enable_RNA_clustering):
    '''
    extract sequences and meta informations of all genes in one reference
    genbank file
    params:
        - strainID: strain name, used as first part ('<strainID>|...') of all
                    gene and RNA IDs
        - gbk_fname: Genbank filename
        - protein_fname: file into which all amino acid sequences are written
                    in fasta format. needed as input for diamond
        - nucleotide_fname: file into which all nucleotide sequences are
                    written in fasta format. needed for cluster sequences
        - RNA_fname: RNA nucleotide_sequences are written in fasta format.
                    Needed as RNA_blast_input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene
                    sequence ID, modified in place
                    (key: geneID; value: geneSeqID)
        - geneID_to_description_dict: dictionary linking geneID to
                    description info, modified in place
                    (key: geneID; value: dict with the keys 'geneName',
                    'contig' and 'annotation')
        - RNAID_to_SeqID_dict: dictionary linking RNAID to RNA sequence ID,
                    modified in place (key: RNAID; value: SeqID)
        - RNAID_to_description_dict: dictionary linking RNAID to description
                    info, modified in place
                    (key: RNAID; value: dict with the keys 'geneName',
                    'contig' and 'annotation')
        - gene_aa_dict, gene_na_dict, RNA_dict: sequence dictionaries indexed
                    as dict[strainID][ID], modified in place
                    (assumes dict[strainID] exists or is created on access,
                    e.g. a nested defaultdict -- TODO confirm with caller)
        - enable_RNA_clustering: when true, rRNA features are extracted as
                    well and RNA_fname is written; otherwise RNA_fname is not
                    touched
    return:
        1 if at least one CDS feature was found, otherwise 0
    '''
    check_CDS_passed = 0
    open_files = []
    try:
        aa_sequence_file = open(protein_fname, 'wb')
        open_files.append(aa_sequence_file)
        nu_sequence_file = open(nucleotide_fname, 'wb')
        open_files.append(nu_sequence_file)
        if enable_RNA_clustering:
            RNA_sequence_file = open(RNA_fname, 'wb')
            open_files.append(RNA_sequence_file)

        ## contig_index is 1-based
        for contig_index, contig in enumerate(
                SeqIO.parse(gbk_fname, 'genbank'), 1):
            for feature in contig.features:
                if feature.type == 'CDS':
                    check_CDS_passed = 1
                    qualifiers = feature.qualifiers
                    ## CDS without product or translation are skipped
                    if ('product' not in qualifiers or
                            'translation' not in qualifiers):
                        continue
                    if 'gene' in qualifiers:
                        geneName = qualifiers['gene'][0].replace(' ', '_')
                    else:
                        geneName = ''
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    trans_seq = qualifiers['translation'][0]
                    if 'locus_tag' in qualifiers:
                        locus_tag = qualifiers['locus_tag'][0]
                    else:
                        locus_tag = qualifiers['db_xref'][0].split(':')[1]
                    ## force to replace '-' with '_' in locus_tag
                    ## ('-' separates the fields of the geneSeqID below)
                    locus_tag = locus_tag.replace('-', '_')
                    locus_tag = _strip_locus_tag_prefixes(locus_tag, strainID)
                    ## geneID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used
                    ## later in orthAgogue.
                    geneID = '%s|%s' % (strainID, locus_tag)
                    na_seq = str(feature.extract(contig.seq))
                    write_in_fa(aa_sequence_file, geneID, trans_seq)
                    write_in_fa(nu_sequence_file, geneID, na_seq)
                    gene_aa_dict[strainID][geneID] = trans_seq
                    gene_na_dict[strainID][geneID] = na_seq
                    geneID_to_description_dict[geneID] = {
                        'geneName': geneName,
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    ## the gene name (if any) precedes the annotation in
                    ## the sequence ID
                    if geneName != '':
                        geneName = '%s_' % geneName
                    geneID_to_geneSeqID_dict[geneID] = '%s|%s-%d-%s%s' % (
                        strainID, locus_tag, contig_index, geneName,
                        annotation)
                elif enable_RNA_clustering and feature.type == 'rRNA':
                    qualifiers = feature.qualifiers
                    if 'product' not in qualifiers:
                        continue
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    try:
                        locus_tag = qualifiers['locus_tag'][0]
                    except (KeyError, IndexError):
                        ## make a random string when locus_tag absent
                        locus_tag = time.strftime('%S', time.gmtime()) + \
                            str(random.randint(0, 10000000))
                    ## NOTE(review): unlike for CDS, '-' in the locus_tag is
                    ## not replaced here although '-' separates the fields
                    ## of the sequence ID below -- confirm whether intended
                    locus_tag = _strip_locus_tag_prefixes(locus_tag, strainID)
                    ## RNAID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used
                    ## later in orthAgogue.
                    RNAID = '%s|%s' % (strainID, locus_tag)
                    RNA_seq = str(feature.extract(contig.seq))
                    write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                    RNA_dict[strainID][RNAID] = RNA_seq
                    RNAID_to_description_dict[RNAID] = {
                        'geneName': '',
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    ## RNAs carry no gene name, so the annotation directly
                    ## follows the contig index
                    RNAID_to_SeqID_dict[RNAID] = '%s|%s-%d-%s' % (
                        strainID, locus_tag, contig_index, annotation)
    finally:
        ## close every opened file (including the RNA file), also on errors
        for sequence_file in open_files:
            sequence_file.close()
    return check_CDS_passed