def RNAclusters_align_makeTree(path, parallel):
    """
    Create RNA clusters as nucleotide fasta files, build the individual RNA
    trees from those fna files and merge the RNA clusters into the gene
    cluster dictionary.

    Parameters:
        path: project folder prefix; sub-paths are appended by plain string
            concatenation, so it must end with a path separator.
        parallel: forwarded unchanged to multips
            (presumably the number of worker processes -- confirm in multips).

    Side effects:
        - writes the RNA cluster fasta files (via create_RNACluster_fa)
        - backs up orthamcl-allclusters_final.cpk to
          orthamcl-allclusters_final.cpk.bk before it is updated
        - updates the gene cluster and diversity pickle files
    """
    # function-scope import: the file-level import block is not visible here
    import shutil

    diamond_RNACluster_dt = create_RNACluster_fa(path)

    ## align, build_tree, make_RNATree_json
    fasta_path = path + 'geneCluster/'
    fa_files = glob.glob(fasta_path + "*RNA*.fna")
    multips(single_RNACluster_align_and_makeTree, fasta_path, parallel,
            fa_files)

    ## add RNA cluster in diamond_geneCluster_dt
    ### load gene cluster
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    cluster_file = geneClusterPath + 'orthamcl-allclusters_final.cpk'
    # Keep a backup of the cluster file before it is rewritten. shutil.copy
    # replaces a shell-built `cp` command: it copes with spaces in the path
    # and raises instead of silently ignoring a failed copy.
    shutil.copy(cluster_file, cluster_file + '.bk')
    diamond_geneCluster_dt = load_pickle(cluster_file)

    ### update gene cluster with RNA cluster
    update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt)

    ### update diversity file
    update_diversity_cpk_file(path)
def postprocess_paralogs_iterative(parallel, path, nstrains,
                                   branch_length_cutoff=500,
                                   paralog_cutoff=0.5, plot=False):
    """
    Repeatedly split paralogous clusters until a pass splits nothing.

    The first pass starts from an empty set of new fasta files; every later
    pass receives the set of fasta files returned by the previous one.

    Parameters:
        parallel, nstrains, plot: forwarded unchanged to postprocess_paralogs
        path: project folder prefix (must end with a path separator)
        branch_length_cutoff, paralog_cutoff: forwarded unchanged to
            postprocess_paralogs as keyword arguments
    """
    cluster_path = path + 'protein_faa/diamond_matches/'
    diamond_geneCluster_dt = load_pickle(cluster_path +
                                         'orthamcl-allclusters.cpk')

    new_fa_files_set = set()
    iteration = 0
    while True:
        n_split_clusters, new_fa_files_set = postprocess_paralogs(
            parallel, path, nstrains, diamond_geneCluster_dt,
            new_fa_files_set,
            branch_length_cutoff=branch_length_cutoff,
            paralog_cutoff=paralog_cutoff,
            plot=plot)
        if not n_split_clusters:
            break
        # Single pre-formatted argument: a multi-argument print(...) shows up
        # as a tuple when `print` is the Python 2 statement.
        print('---- split a total of  %s in iteration %s' %
              (n_split_clusters, iteration))
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk_file(path)

    ## remove old gene cluster and create new split cluster
    update_gene_cluster(path, diamond_geneCluster_dt)
def make_genepresence_alignment(path):
    '''
    Loop over all gene clusters and append 0/1 to a strain-specific string
    that serves as a pseudo alignment of gene presence/absence.

    Parameters:
        path: project folder prefix (must end with a path separator)

    Output (both in path+'geneCluster/'):
        genePresence.aln    -- one fasta record per strain
        dt_genePresence.cpk -- pickled {strain: presence/absence string}
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle(path + 'strain_list.cpk')
    set_totalStrain = set(strain_list)
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(list)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        gene_list = gene[1]
        ## append 0/1 to each strain
        dt_strainGene = create_genePresence(dt_strainGene, totalStrain,
                                            set_totalStrain, gene_list)

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            # collapse the per-cluster entries into one string per strain;
            # only existing keys are reassigned, so iterating is safe
            dt_strainGene[istkey] = ''.join(dt_strainGene[istkey])
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])

    write_pickle(output_path + 'dt_genePresence.cpk', dt_strainGene)
def create_geneCluster_fa():
    """
    Write one nucleotide (.fna) and one amino-acid (.faa) fasta file per
    gene cluster into a freshly created path+'geneCluster/' folder.

    Relies on the module-level name `path` (project folder prefix ending
    with a path separator).

    input:  protein_faa/*.faa,
            nucleotide_fna/<strain>_gene_nuc_dict.cpk,
            protein_faa/diamond_matches/orthamcl-allclusters.cpk,
            strain_list.cpk, geneID_to_geneSeqID.cpk
    output: geneCluster/<clusterID>.fna and geneCluster/<clusterID>.faa
    """
    fasta_path = path + 'geneCluster/'

    ## make sure the geneCluster folder is empty
    if os.path.isdir(fasta_path):
        print('remove previous folder:  %s' % fasta_path)
        os.system('rm -rf %s' % fasta_path)

    faa_path = path + 'protein_faa/'

    ## dict storing all genes' translation
    gene_aa_dict = defaultdict(list)
    for ifaa in glob.glob(faa_path + "*.faa"):
        gene_aa_dict.update(read_fasta(ifaa))

    ## dict storing nucleotide Id/Seq from '_gene_nuc_dict.cpk' files
    istrain_cpk = {}
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(nucleotide_dict_path + istrain +
                                           '_gene_nuc_dict.cpk')

    ## load gene cluster cpk file
    geneCluster_path = faa_path + 'diamond_matches/'
    diamond_geneCluster_dt = load_pickle(geneCluster_path +
                                         'orthamcl-allclusters.cpk')

    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create cluster-genes fasta files
    os.system('mkdir ' + fasta_path)

    ## diamond_geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
    for clusterID, gene in diamond_geneCluster_dt.iteritems():
        ## geneCluster file name
        gene_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        gene_cluster_aa_filename = "%s%s" % (clusterID, '.faa')
        # `with` guarantees both handles are closed even when a lookup or a
        # write raises part-way through a cluster
        with open(fasta_path + gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
                open(fasta_path + gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            ## write nucleotide/amino_acid sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name = gene_memb.split('|')[0]
                gene_memb_seq = str(istrain_cpk[strain_name][gene_memb])
                geneSeqID = geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_memb_seq)
                # NOTE(review): gene_aa_dict is a defaultdict(list), so a
                # gene without a translation is written as [] rather than
                # raising -- confirm this is intended.
                write_in_fa(gene_cluster_aa_write, geneSeqID,
                            gene_aa_dict[gene_memb])
def create_RNACluster_fa(path):
    """
    Write one nucleotide fasta file per RNA cluster and return the RNA
    cluster dictionary.

    input:  strain_list.cpk,
            nucleotide_fna/<strain>_RNA_nuc_dict.cpk,
            RNA_fna/orthamcl-allclusters.cpk,
            RNAID_to_SeqID.cpk
    output: geneCluster/<clusterID>.fna for every RNA cluster
    return: diamond_RNACluster_dt
            {clusterID: [count_strains, [memb1, ...], count_RNAs]}

    The original carried a disabled (`if 0:`) cleanup of a separate
    'RNACluster/' folder; that unreachable branch is dropped here. RNA
    fasta files are written into the existing 'geneCluster/' folder.
    """
    ## dict storing nucleotide Id/Seq from '_RNA_nuc_dict.cpk' files
    istrain_cpk = {}
    strain_list = load_pickle(path + 'strain_list.cpk')
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    for istrain in strain_list:
        istrain_cpk[istrain] = load_pickle(nucleotide_dict_path + istrain +
                                           '_RNA_nuc_dict.cpk')

    ## load RNA cluster cpk file
    RNACluster_path = path + 'RNA_fna/'
    diamond_RNACluster_dt = load_pickle(RNACluster_path +
                                        'orthamcl-allclusters.cpk')

    ## load RNAID_to_RNASeqID RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'

    ## diamond_RNACluster_dt: {clusterID:[ count_strains,[memb1,...],count_RNAs }
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        # `with` closes the handle even when a lookup or write raises
        with open(fasta_path + RNA_cluster_nu_filename, 'wb') as RNA_cluster_nu_write:
            ## write nucleotide sequences into RNACluster files
            for RNA_memb in RNA[1]:
                ## RNA_name format: strain_1|locusTag
                strain_name = RNA_memb.split('|')[0]
                RNA_memb_seq = str(istrain_cpk[strain_name][RNA_memb])
                RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
                write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)

    return diamond_RNACluster_dt
def json_parser(path, species, meta_info_file_path):
    """
    Create the json files for web visualization.

    Parameters:
        path: project folder prefix (must end with a path separator)
        species: forwarded unchanged to create_json_addLabel
        meta_info_file_path: user-supplied meta-information table, or the
            string 'none' to use path+'metainfo_curated.tsv' as it is

    input:  geneCluster/tree_result.newick, geneCluster/dt_genePresence.cpk,
            metainfo_curated.tsv
    output: coreGenomeTree.json and coreGenomeTree-noBranch.json, plus the
            files produced by json_tnt_parser(); results are then linked or
            moved with shell commands (see the end of the function)

    Note: the function changes the process working directory to
    path+'geneCluster/' and does not restore it.
    """
    from ete2 import Tree

    metaFile = path + 'metainfo_curated.tsv'
    if meta_info_file_path != 'none':
        ## copy the user meta_info_file over the curated table
        os.system('pwd')
        os.system('cp %s %s' % (meta_info_file_path, metaFile))

    output_path = '%s%s' % (path, 'geneCluster/')
    tree = Tree(output_path + 'tree_result.newick', format=1)
    dt_genePresence = load_pickle(path + 'geneCluster/dt_genePresence.cpk')

    ## create tree json files
    ## (4th argument 0 / 1 yields the regular / '-noBranch' tree below)
    jsonString = json.dumps(
        create_json_addLabel(species, dt_genePresence, tree, 0, path,
                             metaFile))
    jsonString1 = json.dumps(
        create_json_addLabel(species, dt_genePresence, tree, 1, path,
                             metaFile))

    # every relative file name and shell command below depends on this cwd
    os.chdir(output_path)
    with open('coreGenomeTree.json', 'wb') as write_json:
        write_json.write(jsonString)
    with open('coreGenomeTree-noBranch.json', 'wb') as write_json1:
        write_json1.write(jsonString1)

    ## create tnt-nodeAttri-dataTable.json and tnt-nodeAttri.json for tree tables
    json_tnt_parser()

    ## link all *.cpk files into the parent folder
    ## move coreGenomeTree.json, strainMetainfo.json and
    ## geneGainLossEvent.json to ../vis/
    ## move GC*.aln and GC*_tree.json to ../vis/geneCluster/
    ## (kept as shell commands: they rely on shell glob expansion)
    current_path = os.getcwd()
    os.system('ln -sf %s/*.cpk %s/../' % (current_path, current_path))
    os.system(
        'mv coreGenomeTree.json strainMetainfo.json geneGainLossEvent.json ../vis/;'
    )
    os.system('mv GC*.aln GC*_tree.json ../vis/geneCluster/;')
    print(
        'Pan-genome analysis is finished, your data can be transfered to the local server for data visualization and exploration via link-to-server.py in the main folder.'
    )
def postprocess_unclustered_genes(parallel, path, nstrains,
                                  window_size_smoothed=5,
                                  strain_proportion=0.3, sigma_scale=3):
    """
    Merge unclustered genes into joint clusters, cut the resulting trees
    into new clusters and rewrite the final cluster and diversity files.

    Parameters:
        parallel: forwarded to cut_tree_from_merged_clusters
        path: project folder prefix (must end with a path separator)
        nstrains, window_size_smoothed, strain_proportion, sigma_scale:
            forwarded unchanged to find_and_merge_unclustered_genes
    """
    # 1) detect suspicious peaks in the distribution of average length of
    #    genes in gene cluster (aa count)
    #    np.bincount([1,2,3,34,3]) -> how often each entry is found
    #    np.convolve([1,1,1,1,1], gene_length_count)
    #    -> unclustered genes will contribute many small clusters (size 1)
    #       that result in peaks in the distribution
    # 2) for each peak detected, align the sequences of all genes in
    #    clusters in peak
    # 3) to cluster aligned genes, build tree. However, to ensure long
    #    branches between unaligned sub-alignments, fill gaps with random
    #    sequence (skipped, not tested).
    #    Importantly, this random sequence needs to be the same in
    #    different columns of the alignment.
    #    - random sequence a la
    #      rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
    #    - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
    # 4) make and split tree at branches >.5
    # 5) for each subtree (ideally only one big tree), define new gene
    #    cluster and run maketree_align from standard step 6

    # function-scope import: the file-level import block is not visible here
    import shutil

    ## load clusters
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    cluster_file = geneClusterPath + 'orthamcl-allclusters_final.cpk'
    diamond_geneCluster_dt = load_pickle(cluster_file)

    ## merge unclustered genes
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)

    ## cut tree and make new clusters
    cut_tree_from_merged_clusters(parallel, path, diamond_geneCluster_dt,
                                  merged_clusters_dict)

    ## write new clusters in orthamcl-allclusters_final.cpk
    # Back up the previous cluster file first. shutil.copy replaces a
    # shell-built `cp` command: it copes with spaces in the path and raises
    # instead of silently ignoring a failed copy.
    shutil.copy(cluster_file, cluster_file + '.bk')
    update_gene_cluster(path, diamond_geneCluster_dt)

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk_file(path)
def load_sorted_clusters(path):
    '''
    Load the final gene clusters and sort them first by decreasing
    abundance and then by increasing clusterID.

    Parameters:
        path: project folder prefix (must end with a path separator)

    Returns:
        list of (clusterID, value) tuples from
        protein_faa/diamond_matches/orthamcl-allclusters_final.cpk, where
        value[0] is the abundance used as the primary sort key.
    '''
    geneClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    diamond_geneCluster_dt = load_pickle(geneClusterPath +
                                         'orthamcl-allclusters_final.cpk')
    # sort by decreasing abundance (-value[0], minus to achieve decreasing)
    # followed by increasing clusterID (e.g. GC_00001).
    # The key takes one (clusterID, value) pair and indexes into it; a
    # tuple-unpacking lambda parameter is Python-2-only syntax.
    return sorted(diamond_geneCluster_dt.items(),
                  key=lambda cluster: (-cluster[1][0], cluster[0]))

#=============================================#
# postprocessing unclustered genes (peaks)
#=============================================#
# #from SF06_2_unclustered_genes import find_and_merge_unclustered_genes
# from SF06_2_unclustered_genes import find_and_merge_unclustered_genes, cut_tree_from_merged_clusters
# def postprocess_unclustered_genes(n_threads, path, nstrains, window_size=5, strain_proportion=0.3 , sigma_scale=3):
#     diamond_geneCluster_dt=load_pickle(geneClusterPath+'orthamcl-allclusters_final.cpk')
#     find_and_merge_unclustered_genes(n_threads, path, nstrains, window_size, strain_proportion , sigma_scale)
#     cut_tree_from_merged_clusters(path)
# Show the path prefix that every file name below is appended to.
print path
# Species label: the part of the strain-list file name before '-RefSeq'.
species=strain_list.split('-RefSeq')[0]

def load_strains():
    """
    Load the input strains listed in the strain_list file.

    Reads path+strain_list line by line, keeps the text before '.gbk' on
    each line and pickles the resulting list to path+'strain_list.cpk'.
    Does nothing when the strain list file does not exist.
    Relies on the module-level names `path` and `strain_list`.
    """
    if os.path.isfile(path+strain_list):
        with open(path+strain_list,'rb') as infile:
            write_pickle(path+'strain_list.cpk', [ ist.rstrip().split('.gbk')[0] for ist in infile] )

if 1 in params.steps: #step 01:
    load_strains()
    print 'step01-refSeq strain list successfully found.'

## load strain_list.cpk file and give the total number of strains
# NOTE(review): if strain_list.cpk does not exist, strain_lst is never bound
# and the nstrains line below raises NameError -- confirm the intended nesting.
if os.path.isfile(path+'strain_list.cpk'):
    strain_lst= load_pickle(path+'strain_list.cpk')
nstrains =len([ istrain for istrain in strain_lst ])

if 2 in params.steps:# step02:
    # time the step; the elapsed time is reported through times(start)
    start = time.time()
    accessionID_single(path, strain_lst)
    print 'step02-download NCBI refseq GenBank file from strain list:'
    print times(start)

if 3 in params.steps:# step03:
    start = time.time()
    diamond_input(path, strain_lst, params.disable_RNA_clustering)
    print 'step03-create input file for Diamond from genBank file (.gb):'
    print times(start)

if 4 in params.steps:# step04:
def create_core_SNP_matrix(path):
    """
    Create the SNP matrix from core gene SNPs.

    A cluster counts as a core gene when its strain count and gene count
    both equal the number of strains and its '<clusterID>_na.aln' file
    exists with exactly one sequence per strain. For every core gene the
    polymorphic, gap-free alignment columns are collected and concatenated
    across genes.

    input:  strain_list.cpk, geneCluster/<clusterID>_na.aln
    output (all in path+'geneCluster/'):
            core_geneList.txt, core_geneList.cpk,
            snp_pos.cpk ({alignment file: SNP column indices}),
            SNP_whole_matrix.aln (one record per strain)
    """
    import os
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    refSeqList = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(refSeqList)

    ## create core gene list
    corelist = []
    sorted_geneList = load_sorted_clusters(path)
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if vg[0] != totalStrain or vg[2] != totalStrain:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na.aln')
            ## sequences might be discarded because of premature stops
            coreGeneName_path = alnFilePath + coreGeneName
            if os.path.exists(coreGeneName_path) and len(
                    read_fasta(coreGeneName_path)) == totalStrain:
                outfile.write(coreGeneName + '\n')
                corelist.append(coreGeneName)
            else:
                print('%s%s%s' % ('warning: ', coreGeneName_path,
                                  ' is not a core gene'))
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    # Rows of the SNP matrix follow the alignment records sorted by strain
    # name, so the strain labels are sorted the same way.
    refSeqList.sort()

    snp_pos_dt = defaultdict(list)
    snp_blocks = []
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        fa_sorted_lst = sorted(fa_dt.items(),
                               key=lambda x: x[0].split('|')[0])
        # One vstack over all rows: linear instead of re-copying the array
        # per sequence, and always 2-D even for a single sequence.
        # NOTE(review): np.fromstring / ndarray.tostring are kept for the
        # NumPy version this Python 2 code targets; newer NumPy releases
        # deprecate them in favour of np.frombuffer / ndarray.tobytes.
        nuc_array = np.vstack(
            [np.fromstring(seq, dtype='S1') for _, seq in fa_sorted_lst])

        position_polymorphic = np.where(
            ~np.all(nuc_array == nuc_array[0, :], axis=0))[0]
        position_has_gap = np.where(np.any(nuc_array == '-', axis=0))[0]
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)

        snp_pos_dt[align_file] = position_SNP
        snp_blocks.append(nuc_array[:, position_SNP])

    # concatenate the per-gene SNP columns once; with no core gene the
    # matrix stays empty and no record is written
    if snp_blocks:
        snp_whole_matrix = np.hstack(snp_blocks)
    else:
        snp_whole_matrix = np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def _collection_year(raw_date):
    """
    Reduce a GenBank 'collection_date' value to its year.

    Handles the two layouts the original code distinguished, with or
    without a month abbreviation (e.g. '15-Sep-2011', '2014-03-14',
    '2010-05', '2010'). Returns 'unknown' for an empty value or one that
    cannot be parsed.
    """
    import calendar
    import re

    compact = ''.join(raw_date.split('-'))
    if compact == '':
        return 'unknown'
    dates = re.findall(r'\d+', compact)
    try:
        if any(ch.isalpha() for ch in compact):
            month_abbr = re.findall(r'[a-zA-Z]+', compact)[0]
            month = str(list(calendar.month_abbr).index(month_abbr)).zfill(2)
            if len(compact) == 9:  # dd-Mon-yyyy
                iso_date = dates[1] + '-' + month + '-' + dates[0]
            else:
                iso_date = dates[0] + '-' + month + '-01'  # artificial day 01
        elif len(compact) == 8:  # yyyy-mm-dd
            iso_date = '%s-%s-%s' % (dates[0][:4], dates[0][4:6], dates[0][6:])
        elif len(compact) == 6:  # yyyy-mm
            iso_date = '%s-%s-01' % (dates[0][:4], dates[0][4:6])
        else:
            iso_date = dates[0] + '-01-01'
    except (ValueError, IndexError):
        # unknown month abbreviation or no digits at all: free-text values
        # must not abort the whole export
        return 'unknown'
    # just get the year
    return iso_date.split('-')[0]


def gbk_To_Metainfo(path):
    """
    Extract meta-information (strain name, collection year, country, host)
    from the GenBank files.

    This step is not necessary if the user provides a tab-delimited
    meta-information table as path/"metainfo_curated.tsv".

    Input:  strain_list.cpk, input_GenBank/<strain>.gbk
    Output: path+'metainfo_curated.tsv' (written as metainfo.tsv, then
            renamed) with columns
            accName, strainName, collection_date, country, host
    """
    import os
    from Bio import SeqIO
    from SF00_miscellaneous import load_pickle

    each_gbk_path = '%s%s' % (path, 'input_GenBank/')
    strainList = load_pickle(path + 'strain_list.cpk')
    metainfo_file = path + 'metainfo.tsv'

    with open(metainfo_file, 'wb') as writeseq:
        # write the headers
        writeseq.write("%s\n" % ('\t'.join(
            ['accName', 'strainName', 'collection_date', 'country', 'host'])))

        # check each genBank file to get meta-type
        for eachstrain in strainList:
            with open(each_gbk_path + eachstrain + '.gbk') as gbk_handle:
                for record in SeqIO.parse(gbk_handle, "genbank"):
                    # defaults cover a record without a 'source' feature
                    # (previously stale values or a NameError)
                    strainName = datacolct = country = host = 'unknown'
                    for feature in record.features:
                        if feature.type != 'source':
                            continue
                        qualifiers = feature.qualifiers
                        if 'strain' in qualifiers:
                            strainName = qualifiers['strain'][0]
                        if 'host' in qualifiers:
                            host = qualifiers['host'][0]
                        if 'country' in qualifiers:
                            # 'USA: New...' -> 'USA'
                            country = qualifiers['country'][0].split(':')[0]
                        if 'collection_date' in qualifiers:
                            datacolct = _collection_year(
                                qualifiers['collection_date'][0])
                        # only the first 'source' feature is used
                        break
                    # NOTE(review): one row is written per GenBank record,
                    # so a multi-record file yields several rows for the
                    # same strain -- confirm this is intended.
                    writeseq.write("%s\n" % ('\t'.join(
                        [eachstrain, strainName, datacolct, country, host])))

    # rename only after the file has been closed
    os.rename(metainfo_file, path + 'metainfo_curated.tsv')
def geneCluster_to_json(path, disable_RNA_clustering):
    """
    Create the json file for the gene cluster table visualization.

    Parameters:
        path: project folder prefix (must end with a path separator)
        disable_RNA_clustering: when equal to 0 the RNA descriptions from
            RNAID_to_description.cpk are merged into the gene descriptions

    input:  geneID_to_description.cpk, geneCluster/gene_diversity.cpk,
            geneCluster/dt_geneEvents.cpk, geneCluster/<clusterID>.fna
    output: vis/geneCluster.json (one record per cluster, in the order
            returned by load_sorted_clusters)
    """
    # load geneID_to_description_dict
    geneID_to_description_dict = load_pickle(path +
                                             'geneID_to_description.cpk')
    if disable_RNA_clustering == 0:
        # load RNAID_to_description_file
        geneID_to_description_dict.update(
            load_pickle(path + 'RNAID_to_description.cpk'))

    output_path = '%s%s' % (path, 'geneCluster/')
    visualzition_path = '%s%s' % (path, 'vis/')
    # best effort: the folders may already exist
    os.system('mkdir %s; mkdir %sgeneCluster/' % (visualzition_path,
                                                  visualzition_path))

    gene_diversity_Dt = load_pickle(output_path + 'gene_diversity.cpk')
    ## sorted clusters
    sorted_genelist = load_sorted_clusters(path)
    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(output_path + 'dt_geneEvents.cpk')

    # NOTE(review): values are interpolated without JSON escaping; an
    # annotation containing '"' or '\' would produce invalid JSON unless
    # consolidate_annotation / consolidate_geneName already sanitize them.
    record_template = '{"geneId":%d,"geneLen":%d,"count": %d,"dupli":"%s","dup_detail": "%s","ann":"%s","msa":"%s","divers":"%s","event":"%s","allAnn":"%s", "GName":"%s", "allGName":"%s", "locus":"%s"}'

    # `with` closes the output even when a cluster fails part-way through
    with open(visualzition_path + 'geneCluster.json', 'wb') as write_file_lst_json:
        write_file_lst_json.write('[')
        ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
        for gid, (clusterID, gene) in enumerate(sorted_genelist):
            strain_count, gene_list, gene_count = gene
            if gid:
                # separator before every record except the first
                write_file_lst_json.write(',\n')

            ## annotation majority
            allAnn, majority_annotation = consolidate_annotation(
                path, gene_list, geneID_to_description_dict)

            ## geneName majority
            all_geneName, majority_geneName = consolidate_geneName(
                path, gene_list, geneID_to_description_dict)

            ## extract gain/loss event count
            gene_event = dt_geneEvents[gid]

            ## average length
            geneLength_list = [
                len(igene)
                for igene in read_fasta(output_path + '%s%s' %
                                        (clusterID, '.fna')).values()
            ]
            geneClusterLength = sum(geneLength_list) // len(geneLength_list)

            ## msa
            geneCluster_aln = '%s%s' % (clusterID, '_aa.aln')

            ## check for duplicates
            if gene_count > strain_count:
                duplicated_state = 'yes'
                dup_list = [ig.split('|')[0] for ig in gene_list]
                # "#" to delimit (gene/gene_count)key/value ; "@" to separate genes
                # Counter({'g1': 2, 'g2': 1})
                dup_detail = '@'.join([
                    '%s#%s' % (kd, vd)
                    for kd, vd in dict(Counter(dup_list)).items() if vd > 1
                ])
            else:
                duplicated_state = 'no'
                dup_detail = ''

            ## locus_tag
            locus_tag_strain = ' '.join(gene_list)

            ## write json
            write_file_lst_json.write(
                record_template %
                (gid + 1, geneClusterLength, strain_count, duplicated_state,
                 dup_detail, majority_annotation, geneCluster_aln,
                 gene_diversity_Dt[clusterID], gene_event, allAnn,
                 majority_geneName, all_geneName, locus_tag_strain))
        write_file_lst_json.write(']')