def export_gain_loss(tree, path):
    '''
    write the final tree and the per-branch gene gain/loss encoding to disk

    For every node that has a parent, each gene position is encoded as
    2 * (presence in parent) + (presence in node), giving one character
    per gene; codes 1 and 2 mark a change of state along the branch.

    params:
        tree: object whose `.tree` attribute is the phylogeny; its nodes
              provide `up`, `name` and `genepresence`
        path: project folder; all files are written to <path>/geneCluster/
    output files:
        tree_result.newick:     tree in newick format
        dt_geneEvents.cpk:      {gene index: number of branches with code 1 or 2}
        geneGainLossEvent.json: {node name: string of per-gene codes}
    '''
    from collections import defaultdict

    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])

    # write final tree with internal node names as assigned by treetime
    Phylo.write(tree.tree, sep.join([output_path, 'tree_result.newick']),
                'newick')

    gene_gain_loss_dict = defaultdict(str)
    # traversal order does not matter much here
    for clade in tree.tree.find_clades(order='preorder'):
        parent = clade.up
        if parent is not None:
            codes = []
            for before, after in zip(parent.genepresence, clade.genepresence):
                codes.append(str(2 * int(before) + int(after)))
            gene_gain_loss_dict[clade.name] = "".join(codes)

    # one row per node, one column per gene
    gain_loss_array = np.array(
        [list(code_string) for code_string in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    is_event = np.logical_or(gain_loss_array == 1, gain_loss_array == 2)
    events_array = is_event.sum(axis=0)
    events_dict = dict(enumerate(events_array))
    write_pickle(sep.join([output_path, 'dt_geneEvents.cpk']), events_dict)

    # export gene loss dict to json for visualization
    write_json(gene_gain_loss_dict,
               sep.join([output_path, 'geneGainLossEvent.json']),
               indent=1)
def make_genepresence_alignment(path):
    '''
    build the gene presence/absence pseudo alignment

    Loops over all gene clusters and lets create_genePresence append 0/1 to
    the strain specific lists; the joined per-strain strings are written as
    fasta (genePresence.aln) and pickled (dt_genePresence.cpk) in
    <path>geneCluster/.
    '''
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    set_totalStrain = set(load_pickle(path + 'strain_list.cpk'))
    totalStrain = len(set_totalStrain)

    dt_strainGene = defaultdict(list)
    ## sorted clusters: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for _, cluster_info in load_sorted_clusters(path):
        ## append 0/1 to each strain
        dt_strainGene = create_genePresence(dt_strainGene, totalStrain,
                                            set_totalStrain, cluster_info[1])

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        for strain in dt_strainGene:
            # replace the list of characters by the final string
            presence_string = ''.join(dt_strainGene[strain])
            dt_strainGene[strain] = presence_string
            write_in_fa(presence_outfile, strain, presence_string)

    write_pickle(output_path + 'dt_genePresence.cpk', dt_strainGene)
def update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt):
    '''
    add the RNA clusters to the gene cluster dict and pickle the result

    note: diamond_geneCluster_dt is updated in place; the merged dict is
    written to <path>protein_faa/diamond_matches/orthamcl-allclusters_final.cpk
    '''
    final_cluster_file = ''.join([path, 'protein_faa/diamond_matches/',
                                  'orthamcl-allclusters_final.cpk'])
    ## update gene cluster pickled file
    diamond_geneCluster_dt.update(diamond_RNACluster_dt)
    write_pickle(final_cluster_file, diamond_geneCluster_dt)
def diamond_input(path, strain_lst, disable_RNA_clustering=0):
    '''
    go through all GenBank files and extract sequences and metadata for each one

    params:
        path: project folder (used as string prefix, i.e. expected to end
              with '/'); *gbk files found directly in it are moved to
              <path>input_GenBank/
        strain_lst: strain names; <strain>.gbk is read for each of them
        disable_RNA_clustering: 0 -> RNA ID dictionaries are written as well
    output:
        folders input_GenBank/, protein_faa/, nucleotide_fna/, RNA_fna/
        geneID_to_geneSeqID.cpk, geneID_to_description.cpk and, if RNA
        clustering is enabled, RNAID_to_SeqID.cpk, RNAID_to_description.cpk
    '''
    import glob
    import shutil

    def _ensure_folder(folder):
        # replaces `mkdir` via os.system: an existing folder is tolerated and
        # paths containing spaces or shell metacharacters are handled
        if not os.path.isdir(folder):
            os.makedirs(folder)
        return folder

    each_gbk_path = _ensure_folder('%s%s' % (path, 'input_GenBank/'))
    # same pattern as the former `mv <path>*gbk <path>input_GenBank/`;
    # naming the full target path keeps mv's overwrite behaviour
    for gbk_file in glob.glob('%s*gbk' % path):
        shutil.move(gbk_file,
                    os.path.join(each_gbk_path, os.path.basename(gbk_file)))

    protein_folder = _ensure_folder('%s%s' % (path, 'protein_faa/'))
    nucleotide_dict_path = _ensure_folder('%s%s' % (path, 'nucleotide_fna/'))
    RNA_folder = _ensure_folder('%s%s' % (path, 'RNA_fna/'))

    ## CDS
    geneID_to_geneSeqID_file = path + 'geneID_to_geneSeqID.cpk'
    geneID_to_geneSeqID_dict = defaultdict()
    geneID_to_description_file = path + 'geneID_to_description.cpk'
    geneID_to_description_dict = defaultdict()
    ## RNA
    RNAID_to_SeqID_file = path + 'RNAID_to_SeqID.cpk'
    RNAID_to_SeqID_dict = defaultdict()
    RNAID_to_description_file = path + 'RNAID_to_description.cpk'
    RNAID_to_description_dict = defaultdict()

    # the four dictionaries are filled in place by gbk_translation
    for strain_name in strain_lst:
        diamond_input_fname = '%s%s.faa' % (protein_folder, strain_name)
        RNA_blast_input_fname = '%s%s.fna' % (RNA_folder, strain_name)
        gbk_translation(each_gbk_path, nucleotide_dict_path,
                        '%s.gbk' % strain_name,
                        diamond_input_fname, RNA_blast_input_fname,
                        geneID_to_geneSeqID_dict, geneID_to_description_dict,
                        RNAID_to_SeqID_dict, RNAID_to_description_dict,
                        disable_RNA_clustering)

    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    if disable_RNA_clustering == 0:
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)
def parse_RNACluster(path, inputfile):
    """ store RNA clusters as dictionary in cpk file

    Each line of <path><inputfile> is one cluster with tab separated members
    (format: NC_022226|1-1956082:1956435). The line number gives the cluster
    ID "GC_RNA%03d" and the stored value is
    [number of strains, list of members, number of distinct members].
    """
    RNACluster_dt = defaultdict(list)
    with open("%s%s" % (path, inputfile), 'rb') as infile:
        for gid, iline in enumerate(infile):
            members = iline.rstrip().split('\t')
            ## the strain name is the part in front of '|'
            strains = set(member.split('|')[0] for member in members)
            RNACluster_dt["GC_RNA%03d" % gid] = [
                len(strains),       ## num_strains
                list(members),      ## RNA members
                len(set(members)),  ## num_RNAs
            ]
    write_pickle(path + 'orthamcl-allclusters.cpk', RNACluster_dt)
def parse_geneCluster(path, inputfile, cluster_log=False):
    """ store gene clusters as dictionary in cpk file

    Each line of <path><inputfile> is one cluster with tab separated members
    (format: NC_022226|1-1956082:1956435). The line number gives the cluster
    ID "GC_%08d" and the stored value is
    [number of strains, list of members, number of distinct members].
    With cluster_log=True the clusters are additionally written to
    orthamcl-allclusters.log, sorted by their value in descending order.
    """
    from operator import itemgetter

    geneCluster_dt = defaultdict(list)
    with open("%s%s" % (path, inputfile), 'rb') as infile:
        for gid, iline in enumerate(infile):
            members = iline.rstrip().split('\t')
            ## the strain name is the part in front of '|'
            strains = set(member.split('|')[0] for member in members)
            geneCluster_dt["GC_%08d" % gid] = [
                len(strains),       ## num_strains
                list(members),      ## gene members
                len(set(members)),  ## num_genes
            ]
    write_pickle(path + 'orthamcl-allclusters.cpk', geneCluster_dt)

    if cluster_log != True:
        return

    orthagogue_geneCount_lst = sorted(geneCluster_dt.items(),
                                      key=itemgetter(1), reverse=True)
    with open(path + 'orthamcl-allclusters.log', 'wb') as write_fn_lst:
        for cluster_id, cluster in orthagogue_geneCount_lst:
            write_fn_lst.write('%s%s\n' % (cluster_id, cluster))
def load_strains():
    """ load input strains in strain_list

    Reads the file path+strain_list (both names come from the surrounding
    scope), strips everything from '.gbk' onwards on each line and pickles
    the resulting names to strain_list.cpk. Nothing happens if the file
    does not exist.
    """
    strain_list_file = path + strain_list
    if not os.path.isfile(strain_list_file):
        return
    with open(strain_list_file, 'rb') as infile:
        strain_names = [line.rstrip().split('.gbk')[0] for line in infile]
        write_pickle(path + 'strain_list.cpk', strain_names)
def create_core_SNP_matrix(path):
    """ create SNP matrix using core gene SNPs

    A cluster counts as core gene if its strain count and gene count both
    equal the number of strains and its nucleotide alignment
    (<clusterID>_na.aln) exists with one sequence per strain. For each core
    gene the columns that are polymorphic and free of gaps ('-') are
    collected; the columns of all core genes are concatenated.

    input: strain_list.cpk, sorted gene clusters, <clusterID>_na.aln
    output (in <path>geneCluster/):
        core_geneList.txt / core_geneList.cpk: alignment file names of core genes
        snp_pos.cpk: {alignment file name: array of SNP positions}
        SNP_whole_matrix.aln: fasta, one row of SNPs per strain
    """
    import os
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    # loaded once; used for the strain count and for the row labels
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)

    ## create core gene list
    corelist = []
    sorted_geneList = load_sorted_clusters(path)
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if vg[0] != totalStrain or vg[2] != totalStrain:
                continue
            coreGeneName = '%s%s' % (clusterID, '_na.aln')
            coreGeneName_path = alnFilePath + coreGeneName
            ## sequences might be discarded because of premature stops
            if os.path.exists(coreGeneName_path) and len(
                    read_fasta(coreGeneName_path)) == totalStrain:
                outfile.write(coreGeneName + '\n')
                corelist.append(coreGeneName)
            else:
                # single parenthesised argument: valid as py2 statement and py3 call
                print('%s%s%s' % ('warning: ', coreGeneName_path,
                                  ' is not a core gene'))
    write_pickle(output_path + 'core_geneList.cpk', corelist)

    # rows of the SNP matrix follow the sorted strain names
    refSeqList = sorted(strain_list)

    snp_pos_dt = defaultdict(list)
    snp_blocks = []  # SNP columns of each core gene, concatenated once below
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        # order sequences by the part of the header in front of '|'
        fa_sorted_lst = sorted(fa_dt.items(), key=lambda x: x[0].split('|')[0])
        # stack all sequences in one go (always 2D, also for a single sequence)
        # NOTE(review): np.fromstring / ndarray.tostring are deprecated in
        # newer NumPy releases (frombuffer / tobytes) - check before upgrading
        nuc_array = np.vstack(
            [np.fromstring(seq, dtype='S1') for _, seq in fa_sorted_lst])

        position_polymorphic = np.where(
            ~np.all(nuc_array == nuc_array[0, :], axis=0))[0]
        position_has_gap = np.where(np.any(nuc_array == '-', axis=0))[0]
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)

        snp_pos_dt[align_file] = position_SNP
        snp_blocks.append(nuc_array[:, position_SNP])

    # without core genes the matrix stays empty, as before
    snp_whole_matrix = np.hstack(snp_blocks) if snp_blocks else np.array([])

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)
    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
def diamond_orthamcl_cluster(path, threads, blast_cluster_file_path='none',
    roary_cluster_file_path='none', diamond_orthamcl_cluster='600',
    mcl_inflation=2.0):
    '''
    create the gene clusters in <path>protein_faa/diamond_matches/ by one of:
        - all-against-all comparison using diamond, followed by
          orthoMCL/orthagogue+MCL (standard pipeline)
        - a user-given cluster file from an all-to-all blast comparison and
          orthoMCL/orthagogue+MCL
        - a user-given cluster file from roary
    params:
        path:                    path to directory including data and output
        threads:                 number of parallel threads used to run diamond
        blast_cluster_file_path: gene clusters by all-to-all blast
                                 comparison and orthoMCL/orthagogue+MCL
                                 ('none': not used)
        roary_cluster_file_path: gene clusters by roary ('none': not used;
                                 takes precedence over the blast file)
        diamond_orthamcl_cluster:
            Diamond setting: the maximum number of target sequences per
            query to keep alignments for.
            Default: #strain * #max_duplication= 40*15= 600
        mcl_inflation:           passed on to ortha_mcl_run
    '''
    for required_exe in ('orthAgogue', 'mcl'):
        check_exe(required_exe)

    input_path = path + 'protein_faa/'
    output_path = input_path + 'diamond_matches/'
    threads = str(threads)
    all_cluster_file = 'orthamcl-allclusters.csv'

    if roary_cluster_file_path != 'none':
        ## using cluster files from roary
        os.system('mkdir %s' % output_path)
        os.system('ln -sf %s %sclustered_proteins' %
                  (roary_cluster_file_path, output_path))
        with open(roary_cluster_file_path, 'rb') as cluster_external_file:
            with open(output_path + all_cluster_file, 'wb') as cluster_final_file:
                for cluster_line in cluster_external_file:
                    ## members are listed after 'name: ', separated by tabs
                    gene_tags = cluster_line.rstrip().split(': ')[1].split('\t')
                    ## make sure strain and locus are separated by '|'
                    converted_tags = [
                        gene_tag if '|' in gene_tag
                        else gene_tag.replace('_', '|')
                        for gene_tag in gene_tags]
                    cluster_final_file.write('%s\n' % '\t'.join(converted_tags))
        parse_geneCluster(output_path, all_cluster_file)

    elif blast_cluster_file_path != 'none':
        ## using user-given cluster file based on blast
        from operator import itemgetter
        os.system('mkdir %s' % output_path)
        os.system('ln -sf %s %sclustered_proteins' %
                  (blast_cluster_file_path, output_path))
        ## create gene cluster from blast output
        geneCluster_dt = defaultdict(list)
        with open(blast_cluster_file_path, 'rb') as infile:
            for gid, iline in enumerate(infile):
                gene_list = [gene_tag.replace('_', '|')
                             for gene_tag in iline.rstrip().split('\t')]
                strains = set(gene_tag.split('|')[0] for gene_tag in gene_list)
                ## [num_strains, gene members, num_genes]
                geneCluster_dt["GC_%08d" % gid] = [
                    len(strains), gene_list, len(set(gene_list))]
        write_pickle(output_path + 'orthamcl-allclusters.cpk', geneCluster_dt)

        orthagogue_geneCount_lst = sorted(geneCluster_dt.items(),
                                          key=itemgetter(1), reverse=True)
        with open(output_path + 'orthamcl-allclusters.log', 'wb') as write_fn_lst:
            for cluster_id, cluster in orthagogue_geneCount_lst:
                write_fn_lst.write('%s%s\n' % (cluster_id, cluster))

    else:
        ## using standard pipeline
        dmd_ref_file = 'reference.faa'
        dmd_query_file = 'query.faa'
        ## prepare dmd_query_file
        os.system('mkdir %s' % output_path)
        os.system('cat %s*faa > %s%s' % (input_path, output_path, dmd_query_file))
        ## dmd_query_file is dmd_ref_file
        os.system('cp %s%s %s%s' % (output_path, dmd_query_file,
                                    output_path, dmd_ref_file))
        diamond_run(output_path, output_path, dmd_ref_file, threads,
                    diamond_orthamcl_cluster)
        ortha_mcl_run(output_path, threads, mcl_inflation)
        ## save singletons
        origin_cluster_file = 'orthamcl-cluster.output'
        orthagogue_singletons(output_path, origin_cluster_file, dmd_query_file)
        ## clean up diamond_query_file
        os.system('rm %s*faa' % output_path)
        parse_geneCluster(output_path, all_cluster_file)
def update_diversity_cpk_file(path):
    '''
    write gene_diversity.cpk from gene_diversity.txt (in <path>geneCluster/)

    The text file is tab separated; the pickled dict maps the first column
    of each line to the second column (kept as read from the file).
    '''
    output_path = path + 'geneCluster/'
    diversity_dt = {}
    with open(output_path + 'gene_diversity.txt', 'rb') as infile:
        for line in infile:
            columns = line.rstrip().split('\t')
            diversity_dt[columns[0]] = columns[1]
        write_pickle(output_path + 'gene_diversity.cpk', diversity_dt)
def _short_locus_tag(locus_tag, strainName):
    ''' strip the 'PROKKA_' marker and the '<strainName>_' prefix from a locus_tag '''
    if "PROKKA" in locus_tag:
        locus_tag = locus_tag.replace('PROKKA_', '')
    strain_prefix = '%s_' % strainName
    if strain_prefix in locus_tag:
        locus_tag = locus_tag.split(strain_prefix)[1]
    return locus_tag


def gbk_translation(each_gbk_path, nucleotide_dict_path, gb_file,
                    output_filename, output_filename2,
                    geneID_to_geneSeqID_dict, geneID_to_description_dict,
                    RNAID_to_SeqID_dict, RNAID_to_description_dict,
                    disable_RNA_clustering):
    '''
    extract sequences and meta informations of all genes in one reference genbank file
    params:
        - each_gbk_path:        path to the set of reference sequences used
                                to construct the core genome
        - nucleotide_dict_path: path to the cPickled dicts of all nucleotide
                                sequences for each genome
        - gb_file:              name of the reference to be analyzed
        - output_filename:      file into which all amino acid sequences are
                                written in fasta format. needed as input for diamond
        - output_filename2:     RNA nucleotide_sequences are written in fasta
                                format. Needed as RNA_blast_input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene sequence ID
                                modified in place (key: geneID; value: geneSeqID )
        - geneID_to_description_dict: dictionary linking geneID to description info
                                modified in place (key: geneID; value: a dict
                                with geneName, contig index and annotation)
        - RNAID_to_SeqID_dict:  dictionary linking RNAID to RNA sequence ID
                                modified in place (key: RNAID; value: SeqID )
        - RNAID_to_description_dict: dictionary linking RNAID to description info
                                modified in place (key: RNAID; value: a dict
                                with geneName, contig index and annotation)
        - disable_RNA_clustering: not cluster rRNA and tRNA
                                (default: 0 -> cluster RNAs)
    output:
        <strain>_gene_nuc_dict.cpk and, if RNAs are clustered,
        <strain>_RNA_nuc_dict.cpk in nucleotide_dict_path

    CDS features are only used if they have both a 'product' and a
    'translation' qualifier, rRNA/tRNA features only with a 'product'.
    '''
    reference_gb = '%s%s' % (each_gbk_path, gb_file)
    strainName = gb_file.split('.gbk')[0]
    # single switch for all RNA related steps (same test as in diamond_input),
    # so the RNA branch can never run without its output file being open
    cluster_RNA = disable_RNA_clustering == 0

    gene_nuc_seq_dict = '%s%s_gene_nuc_dict.cpk' % (nucleotide_dict_path,
                                                    strainName)
    gene_nucleotide_sequences = defaultdict()
    RNA_nuc_seq_dict = '%s%s_RNA_nuc_dict.cpk' % (nucleotide_dict_path,
                                                  strainName)
    RNA_nucleotide_sequences = defaultdict()

    aa_sequence_file = open(output_filename, 'wb')
    RNA_sequence_file = None
    try:
        if cluster_RNA:
            RNA_sequence_file = open(output_filename2, 'wb')

        # contigs are numbered starting from 1
        for contig_index, contig in enumerate(
                SeqIO.parse(reference_gb, 'genbank'), 1):
            for feature in contig.features:
                qualifiers = feature.qualifiers
                if feature.type == 'CDS':
                    if 'product' not in qualifiers or 'translation' not in qualifiers:
                        continue
                    if 'gene' in qualifiers:
                        geneName = qualifiers['gene'][0].replace(' ', '_')
                    else:
                        geneName = ''
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    trans_seq = qualifiers['translation'][0]
                    locus_tag = _short_locus_tag(qualifiers['locus_tag'][0],
                                                 strainName)
                    ## geneID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used later in orthAgogue.
                    geneID = '%s|%s' % (strainName, locus_tag)
                    write_in_fa(aa_sequence_file, geneID, trans_seq)
                    geneID_to_description_dict[geneID] = {
                        'geneName': geneName,
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    # in the sequence ID a gene name (if any) precedes the
                    # annotation, separated by '_'
                    if geneName != '':
                        geneName = '%s_' % geneName
                    geneID_to_geneSeqID_dict[geneID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)
                    gene_nucleotide_sequences[geneID] = feature.extract(
                        contig.seq)
                elif cluster_RNA and feature.type in ('rRNA', 'tRNA'):
                    if 'product' not in qualifiers:
                        continue
                    geneName = ''
                    annotation = '_'.join(qualifiers['product'][0].split(' '))
                    locus_tag = _short_locus_tag(qualifiers['locus_tag'][0],
                                                 strainName)
                    ## RNAID is composed of strain_name and locus_tag
                    ## Keeping '|' separator is important, which is used later in orthAgogue.
                    RNAID = '%s|%s' % (strainName, locus_tag)
                    RNA_seq = str(feature.extract(contig.seq))
                    write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                    RNAID_to_description_dict[RNAID] = {
                        'geneName': '',
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    RNAID_to_SeqID_dict[RNAID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)
                    RNA_nucleotide_sequences[RNAID] = RNA_seq
    finally:
        # close both fasta files, also when parsing fails half-way
        aa_sequence_file.close()
        if RNA_sequence_file is not None:
            RNA_sequence_file.close()

    write_pickle(gene_nuc_seq_dict, gene_nucleotide_sequences)
    if cluster_RNA:
        write_pickle(RNA_nuc_seq_dict, RNA_nucleotide_sequences)