def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    fasttree_name= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for RNA_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len( read_fasta(RNA_cluster_nu_filename) )==1: # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename= RNA_cluster_nu_filename.replace('.fna','_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(RNA_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                geneDiversity_file.write('%s\t%s\n'%(clusterID,'0.0'))
            else: # align and build tree
                print RNA_cluster_nu_filename
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()

                if simple_tree==False:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine(CDS=False)
                else:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)

                RNA_diversity_values='{0:.3f}'.format(myTree.diversity_nuc)
                geneDiversity_file.write('%s\t%s\n'%(clusterID,RNA_diversity_values))
                print clusterID,RNA_diversity_values
        except Exception as e:
            print("Aligning and tree building of RNA %s failed: %s" % (RNA_cluster_nu_filename, e))
        finally:
            geneDiversity_file.close()
def align_and_makeTree( fna_file_list, alignFile_path, simple_tree):
    fasttree_name= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            start = time.time()
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len( read_fasta(gene_cluster_nu_filename) )==1: # nothing to do for singletons
                ## na_aln.fa
                gene_cluster_nu_aln_filename= gene_cluster_nu_filename.replace('.fna','_na_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_nu_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                os.system( ' '.join(['cp',gene_cluster_nu_aln_filename,gene_cluster_nu_aln_filename.replace('_aln','_aln_reduced')]) )

                ## aa_aln.fa
                gene_cluster_aa_filename= gene_cluster_nu_filename.replace('.fna','.faa')
                gene_cluster_aa_aln_filename= gene_cluster_nu_filename.replace('.fna','_aa_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_aa_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                os.system( ' '.join(['cp',gene_cluster_aa_aln_filename,gene_cluster_aa_aln_filename.replace('_aln','_aln_reduced')]) )

                geneDiversity_file.write('%s\t%s\n'%(clusterID,'0.0'))
            else: # align and build tree
                #print gene_cluster_nu_filename
                myTree = mpm_tree(gene_cluster_nu_filename)
                myTree.codon_align()
                myTree.translate()
                if simple_tree==False:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path)
                #myTree.diversity_statistics_aa()
                #random_alnID=myTree.seqs.keys()[0].split('-')[0]
                diversity_nuc= round(myTree.diversity_nuc,3)#diversity_aa=round(myTree.diversity_aa,3)
                #bestSplit_paraNodes,bestSplit_branchLen = myTree.paralogy_statistics()
                #mean_seqLen, std_seqLen=  myTree.mean_std_seqLen()
                #mean_seqLen, std_seqLen= [ round(i,3) for i in mean_seqLen, std_seqLen ]
                geneDiversity_file.write('%s\t%s\n'%(clusterID,diversity_nuc))
                if 0: ## disabled: extra cluster statistics (needs the commented-out values above)
                    cluster_correl_stats_file = open(alignFile_path+'cluster_correl_stats.txt', 'a')
                    cluster_correl_stats_file.write('%s\n'%'\t'.join([
                     str(i) for i in [clusterID, random_alnID, diversity_nuc, \
                        mean_seqLen, std_seqLen, bestSplit_paraNodes, bestSplit_branchLen ] ]))
        except Exception as e:
            print("Aligning and tree building of %s failed: %s" % (gene_cluster_nu_filename, e))
        finally:
            geneDiversity_file.close()
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_seqs_path = '%ssubproblem_cluster_seqs/' % clustering_path
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input,\
         open(representative_outputfile, 'a') as representative_output:
        subproblem_geneCluster_dt = {}
        subproblem_run_number = input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input):
            ## use time to avoid clusterID conflict
            clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid,
                                          time.strftime('%M%S', time.gmtime()))
            gene_ids = iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID] = gene_ids
            ## write the representative (first) member's sequence
            representative_seq = subproblem_faa_dict[gene_ids[0]]
            write_in_fa(representative_output, clusterID,
                        representative_seq)
        ## write subproblem_geneCluster_dt
        write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                     subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix, ': ', times(
        start), '\n'
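## For reference, clusterIDs produced above concatenate the subproblem
## number, a zero-padded per-line index and an MMSS timestamp:
import time
subproblem_run_number = '3'
gid = 41
print "GCs%s_%07d%s" % (subproblem_run_number, gid,
                        time.strftime('%M%S', time.gmtime()))
## -> 'GCs3_0000041' followed by the current minute+second, e.g. 'GCs3_00000411207'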
def gather_seq_length(faa_path):
    """ """
    seq_length_dt = defaultdict()
    for faa_file in glob.iglob(''.join([faa_path, '*faa'])):
        for gene_tag, seq in read_fasta(faa_file).iteritems():
            seq_length_dt[gene_tag] = len(seq)
    return seq_length_dt
def make_gene_presence_absence_matrix(input_filepath, output_filepath='genePresence_matrix.csv'):
    ## NOTE: output_filepath was used but never defined in the original;
    ## it is exposed as a parameter here and the default file name is an assumption
    os.chdir(input_filepath)
    gene_order= ','.join([gene.rstrip() for gene, content in load_sorted_clusters('./')])

    with open('./geneCluster/genePresence.aln') as inputf,\
         open(output_filepath,'wb') as outputf:
        outputf.write('accession,%s\n'%gene_order)
        for strain, genes in read_fasta(inputf).iteritems():
            outputf.write('%s,%s\n'%(strain,','.join(genes)))
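## The resulting CSV holds one header row of clusterIDs and one row per
## strain; each entry is the presence/absence character taken from the
## genePresence.aln pseudo-alignment (here assumed to be '1'/'0').
## Illustrative layout (invented names):
##   accession,GC_00001,GC_00002,GC_00003
##   NC_018495,1,1,0
##   NC_011333,1,0,1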
def find_and_merge_unclustered_genes(path,
                                     nstrains,
                                     window_size=5,
                                     strain_proportion=0.3,
                                     sigma_scale=3):
    """
    detect the unclustered genes and concatenate them
    params:
        nstrains: total number of strains
        window_size
        strain_proportion
        sigma_scale
    return:
        a dict with key of the merged cluster and value of
        a list of related unclustered cluster-name for deletion

    """
    file_path = '%s%s' % (path, 'geneCluster/')
    gene_clusters = load_sorted_clusters(path)
    length_to_cluster = defaultdict(list)
    length_list = []
    ## calculate cluster length distribution, link clusterIDs with their clusterLength
    for gid, (clusterID, gene) in enumerate(gene_clusters):
        # average length of the cluster in amino acids
        clusterLength = int(
            np.mean([
                len(igene)
                for igene in read_fasta(file_path + '%s%s' %
                                        (clusterID, '.fna')).values()
            ]) / 3.0)
        length_to_cluster[clusterLength].append(clusterID)
        length_list.append(clusterLength)
    cluster_length_distribution = np.bincount(length_list)

    ## calculate smoothed cluster length distribution
    window = np.ones(window_size, dtype=float) / window_size
    smoothed_length_distribution = np.convolve(cluster_length_distribution,
                                               window,
                                               mode='same')

    ## detect peaks
    peaks = (cluster_length_distribution -
             smoothed_length_distribution) > np.maximum(
                 strain_proportion * nstrains,
                 sigma_scale * np.sqrt(smoothed_length_distribution))
    position_peaks = np.where(peaks)[0]
    #cluster_len_peaks= position_peaks*3
    ## concatenate clusters with the same aver. length, return dict of these clusters
    merged_clusters_dict = defaultdict(dict)
    for index, i_peak in enumerate(position_peaks, 1):
        merged_cluster_filename, cluster_needed_deletion = concatenate_cluster_files(
            length_to_cluster[i_peak], index, file_path)
        merged_clusters_dict[merged_cluster_filename] = cluster_needed_deletion
    return merged_clusters_dict
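## Toy demonstration of the peak detection above: a pile-up of clusters at
## one length stands out against the smoothed background (numbers invented,
## with nstrains=50 and the default strain_proportion/sigma_scale).
import numpy as np
length_list = [10]*3 + [11]*4 + [12]*60 + [13]*5 + [14]*2
counts = np.bincount(length_list)
window = np.ones(5, dtype=float)/5
smoothed = np.convolve(counts, window, mode='same')
peaks = (counts - smoothed) > np.maximum(0.3*50, 3*np.sqrt(smoothed))
print np.where(peaks)[0]   ## -> [12], the over-represented cluster length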
def calculate_aln_consensus(aln_file):
    """ """
    aln_dt= read_fasta(aln_file)
    alphabet = 'ACDEFGHIKLMNPQRSTVWY*-X'#alphabet = 'ACGT-N'
    if len(aln_dt) == 1:
        ## only one seq
        ## if letters not in alphabet:
        consensus_arr_seq=''.join([ ic if ic in alphabet else 'X' for ic in aln_dt.values()[0] ])
    else:
        ## consensus of multiple seqs
        try:
            aln_array = np.array([ i for i in aln_dt.values()])
            aln_array = aln_array.view('S1').reshape((aln_array.size, -1))
            af = np.zeros((len(alphabet), aln_array.shape[1]))
            for ai, state in enumerate(alphabet):
                af[ai] += (aln_array==state).mean(axis=0)
            ## assign invalid character to the last letter in alphabet (N for nuc or X for aa )
            af[-1] = 1.0 - af[:-1].sum(axis=0)
            consensus_arr_seq=''.join([ alphabet[ic] for ic in af.argmax(axis=0) ])
        except:
            print 'errors in calculating consensus seq: ', aln_file
            raise  # consensus_arr_seq would otherwise be unbound at the return below
    return consensus_arr_seq
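## Worked example of the consensus logic above: per-column letter
## frequencies are accumulated and the argmax letter is taken per column
## (toy three-sequence alignment, same array tricks as in the function).
import numpy as np
alphabet = 'ACDEFGHIKLMNPQRSTVWY*-X'
aln_array = np.array(['MKT-', 'MKA-', 'MRT-'])
aln_array = aln_array.view('S1').reshape((aln_array.size, -1))
af = np.zeros((len(alphabet), aln_array.shape[1]))
for ai, state in enumerate(alphabet):
    af[ai] += (aln_array == state).mean(axis=0)
af[-1] = 1.0 - af[:-1].sum(axis=0)
print ''.join(alphabet[ic] for ic in af.argmax(axis=0))   ## -> 'MKT-'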
def create_core_SNP_matrix(path,
                           core_cutoff=1.0,
                           core_gene_strain_fpath=''):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                [i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain * core_cutoff)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath != '' and len(
                            core_strain_set -
                            set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass

        write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])
    snps_by_gene = []
    for align_file in corelist:  ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(),
                                       key=lambda x: x[0])

        start_flag = 0
        if core_cutoff == 1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack(
                        (nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = set(strain_seq_dt.keys())
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(
                            np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.array(
                            np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.vstack((nuc_array,
                                               np.fromstring(missing_gene_seq,
                                                             dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(
                nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            ## turn rows of entirely-missing genes into gaps before extracting SNP columns
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
            #print snp_columns

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
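## Small check of the SNP-column logic above: columns that vary across
## rows but contain no gap are the ones kept in the matrix (toy data).
import numpy as np
nuc_array = np.array([list('ATGCA'),
                      list('ATGTA'),
                      list('AT-CA')])
position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
position_has_gap = np.any(nuc_array == '-', axis=0)
position_SNP = position_polymorphic & (~position_has_gap)
print np.where(position_SNP)[0]    ## -> [3]; column 2 varies but has a gap
print nuc_array[:, position_SNP]   ## -> the SNP column ['C', 'T', 'C']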
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2,
                               geneCluster_dt):
    """
    delete the old cluster and create two new clusters
    params:
        new_fa_files: list to which new file names are appeneded
        gene_list1/2: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
    """
    origin_nwk_name = fname.split('/')[-1]
    clusterID = origin_nwk_name.replace('.nwk', '')
    origin_cluster_nu_fa = origin_nwk_name.replace('nwk', 'fna')
    origin_cluster_aa_fa = origin_nwk_name.replace('nwk', 'faa')

    split_fa_files_set = set()
    ## load genes from old clusters
    origin_nu_fa_dt = read_fasta(file_path + origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_cluster_aa_fa)

    ## delete old (split) clusters
    try:
        #print('deleting:', origin_nwk_name)
        ##debug:
        ##print('deleting:', origin_nwk_name, gene_list1, gene_list2, clusterID)
        del geneCluster_dt[clusterID]
        with open(file_path + 'old_clusters_paralogSplit.txt',
                  'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n' % clusterID)
        if os.path.exists(fname):
            suffix_list = [
                '_aa_aln.fa', '_na_aln.fa', '.fna', '.faa', '.nwk',
                '_tree.json'
            ]
        else:
            suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
        tmp_files = ' '.join(
            [file_path + clusterID + suffix for suffix in suffix_list])
        command_move_deleted_clusters = ' '.join(
            ['mv', tmp_files, file_path + 'paralog_splits/'])
        os.system(command_move_deleted_clusters)
    except:
        print("paralog splitting: can't delete", orgin_nwk_name)
        ##debug:
        ##print("can't delete",orgin_nwk_name,gene_list1,gene_list2, clusterID)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate((list(gene_list1), list(gene_list2)), 1):
        newClusterId = "%s_p%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_nu_write = open(file_path + gene_cluster_nu_filename,
                                     'wb')
        gene_cluster_aa_write = open(file_path + gene_cluster_aa_filename,
                                     'wb')

        split_fa_files_set |= set([file_path + gene_cluster_nu_filename])

        ## write new split cluster files
        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:
                gene_memb = gene_memb.replace("\\'", "'")
            try:
                write_in_fa(gene_cluster_nu_write, gene_memb,
                            origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb,
                            origin_aa_fa_dt[gene_memb])
            except:
                print 'paralogy splitting (problem to write new split cluster files)', fname  #, gene_memb, gene_list1, gene_list2

        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()

        geneCluster_dt[newClusterId] = [0, [], 0]
        ## num_strains
        geneCluster_dt[newClusterId][0] = len(
            set(ig.split('|')[0] for ig in split_gene_list))
        ## num_genes
        geneCluster_dt[newClusterId][2] = len(set(split_gene_list))
        ## gene members
        geneCluster_dt[newClusterId][1] = [
            ig.split('-')[0] for ig in split_gene_list
        ]
    return split_fa_files_set
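## The statistics triple stored per cluster is [num_strains, members,
## num_genes]; the strain is the prefix before '|' in each geneSeqID.
## Toy example with invented IDs of the form strain|locus_tag-count-annotation:
split_gene_list = ['NC_01|tagA-1-annX', 'NC_01|tagB-1-annY', 'NC_02|tagC-1-annZ']
num_strains = len(set(ig.split('|')[0] for ig in split_gene_list))  ## 2 strains
num_genes = len(set(split_gene_list))                               ## 3 genes
members = [ig.split('-')[0] for ig in split_gene_list]              ## ['NC_01|tagA', ...]
print [num_strains, members, num_genes]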
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID: [count_strains, [memb1, ...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff == 1.0:
            strain_core_cutoff = totalStrain
        else:
            strain_core_cutoff = int(totalStrain * core_cutoff)
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0] == cluster_stats[2] and cluster_stats[0] >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## create dict storing all genes' translation
    ## (disabled one-off step: regenerate the cached sequence pickles)
    if 0:
        gene_aa_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id] = read_fasta(''.join(
                [protein_path, accession_id, '.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)

        ## create dict for all gene's nucleotide sequence
        gene_na_dict = defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id] = read_fasta(''.join(
                [nucleotide_path, accession_id, '.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    refined_core_diversity = round(
        (0.1 + factor_core_diversity * calculated_core_diversity) /
        (1 + factor_core_diversity * calculated_core_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print(
        'defined core genome diversity cutoff for splitting long branches: ' +
        str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
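## Worked example of the cutoff formula above, refined = (0.1+f*d)/(1+f*d):
## small core diversities d give a cutoff slightly above 0.1 and the value
## saturates towards 1 for diverse species (factor f chosen for illustration).
factor_core_diversity = 5.0
for d in (0.0, 0.01, 0.05, 0.2):
    print d, round((0.1 + factor_core_diversity*d) /
                   (1 + factor_core_diversity*d), 4)
## -> 0.1, 0.1429, 0.28, 0.55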
def output_cutted_clusters(file_path,
                           uncluster_filename,
                           gene_list,
                           cut_branch_threshold,
                           treefile_used=None,
                           cut_leftover=None):
    """
    delete the unclustered file and create new clusters
    params:
        gene_list: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
        cut_leftover: flag to indicate whether there are the leftover nodes
            after cutting long branches. Default: empty.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna', 'faa')

    new_fa_files = set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if cut_leftover == True:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_nu_filepath = file_path + gene_cluster_nu_filename
        gene_cluster_nu_write = open(gene_cluster_nu_filepath, 'wb')

        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_aa_filepath = file_path + gene_cluster_aa_filename
        gene_cluster_aa_write = open(file_path + gene_cluster_aa_filename,
                                     'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:  # Replace '\' in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb = gene_memb.replace("\\'", "'")

            write_in_fa(gene_cluster_nu_write, gene_memb,
                        origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb,
                        origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()
        #=============================================

        if cut_leftover == True:
            ## align the remaining genes, build a tree, and cut long branches until nothing more can be cut
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
        else:
            ## record the misclusters to be deleted (already addressed in cutTree_outputCluster )
            ## it will output the same cluster several times
            #with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
            #    delete_cluster_file.write('%s\n'%uncluster_filename)

            ## add record in new_clusters_longSplit.txt, which is used for align new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt = defaultdict(list)
            addin_geneCluster_dt[newClusterId] = [0, [], 0]
            ## num_strains
            addin_geneCluster_dt[newClusterId][0] = len(
                set(ig.split('|')[0] for ig in split_gene_list))
            ## num_genes
            addin_geneCluster_dt[newClusterId][2] = len(set(split_gene_list))
            ## gene members
            addin_geneCluster_dt[newClusterId][1] = [
                ig.split('-')[0] for ig in split_gene_list
            ]
            ## cPickle new cluster statistics
            write_pickle(
                ''.join([
                    file_path, 'update_long_branch_splits/', newClusterId,
                    '.cpk'
                ]), addin_geneCluster_dt)

    ## write records in gene_diversity file
    with open(file_path + 'new_clusters_longSplit.txt',
              'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n' % i)
def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    create json file for gene cluster table visualization
    input:  path to genecluster output
    output: geneCluster.json
    """
    # define path and make output directory
    geneCluster_path = '%s%s' % (path, 'geneCluster/')
    output_path = '%s%s' % (path, 'vis/')

    # open files
    geneClusterJSON_outfile = open(output_path + 'geneCluster.json', 'wb')
    ##store locus_tags in a separate file for large dataset
    if store_locus_tag:
        locus_tag_outfile = open(path + 'search_locus_tag.tsv', 'wb')

    ### load precomputed annotations, diversity, associations etc
    # load geneID_to_descriptions
    geneID_to_descriptions = load_pickle(path + 'geneID_to_description.cpk')

    if enable_RNA_clustering:
        # load RNAID_to_description_file
        geneID_to_descriptions.update(
            load_pickle(path + 'RNAID_to_description.cpk'))

    gene_diversity_Dt = load_pickle(geneCluster_path + 'gene_diversity.cpk')
    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(geneCluster_path + 'dt_geneEvents.cpk')
    ## load association
    branch_associations_path = path + 'branch_association.cpk'
    if os.path.isfile(branch_associations_path):
        branch_associations = load_pickle(branch_associations_path)
    else:
        branch_associations = {}
    presence_absence_associations_path = path + 'presence_absence_association.cpk'
    if os.path.isfile(presence_absence_associations_path):
        presence_absence_associations = load_pickle(
            presence_absence_associations_path)
    else:
        presence_absence_associations = {}

    ## load list of clustered sorted by strain count
    sorted_genelist = load_sorted_clusters(path)

    geneClusterJSON_outfile.write('[')
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for gid, (clusterID, gene) in enumerate(sorted_genelist):
        strain_count, gene_list, gene_count = gene
        # #print strain_count, gene_count
        if gid != 0:  ## begin
            geneClusterJSON_outfile.write(',\n')

        ## annotation majority
        allAnn, majority_annotation = consolidate_annotation(
            path, gene_list, geneID_to_descriptions)

        ## geneName majority
        all_geneName, majority_geneName = consolidate_geneName(
            path, gene_list, geneID_to_descriptions)

        ## extract gain/loss event count
        gene_event = dt_geneEvents[gid]

        ## average length
        seqs = read_fasta(geneCluster_path + '%s%s' %
                          (clusterID, '.fna')).values()
        geneClusterLength = int(np.mean([len(igene) for igene in seqs]))

        ## msa
        #geneCluster_aln='%s%s'%(clusterID,'_aa.aln')
        geneCluster_aln = clusterID

        ## check for duplicates
        if gene_count > strain_count:
            duplicated_state = 'yes'
            dup_list = [ig.split('|')[0] for ig in gene_list]
            # "#" to delimit (gene/gene_count)key/value ; "@" to seperate genes
            # Counter({'g1': 2, 'g2': 1})
            dup_detail = ''.join([
                '%s#%s@' % (kd, vd)
                for kd, vd in Counter(dup_list).iteritems() if vd > 1
            ])[:-1]
        else:
            duplicated_state = 'no'
            dup_detail = ''

        ## locus_tag
        if raw_locus_tag:  # make a string of all locus tags [1] in igl.split('|')
            all_locus_tags = ' '.join([igl.split('|')[1] for igl in gene_list])
        else:  # in addition to locus tag, keep strain name (but replace '|')
            all_locus_tags = ' '.join(
                [igl.replace('|', '_') for igl in gene_list])

        ## optionally store locus tags to file, remove from geneClusterJSON
        if store_locus_tag:
            locus_tag_outfile.write('%s\t%s\n' % (clusterID, all_locus_tags))
            all_locus_tags = ''

        ## default cluster json fields
        cluster_json_line = [
            '"geneId":' + str(gid + 1), '"geneLen":' + str(geneClusterLength),
            '"count":' + str(strain_count), '"dupli":"' + duplicated_state +
            '"', '"dup_detail":"' + dup_detail + '"',
            '"ann":"' + majority_annotation + '"',
            '"msa":"' + geneCluster_aln + '"',
            '"divers":"' + gene_diversity_Dt[clusterID] + '"',
            '"event":"' + str(gene_event) + '"', '"allAnn":"' + allAnn + '"',
            '"GName":"' + majority_geneName + '"',
            '"allGName":"' + all_geneName + '"',
            '"locus":"' + all_locus_tags + '"'
        ]

        if optional_table_column:
            cluster_json_line.extend(
                optional_geneCluster_properties(gene_list,
                                                optional_table_column))
        if clusterID in branch_associations:
            cluster_json_line.extend(
                geneCluster_associations(branch_associations[clusterID],
                                         suffix='BA'))
        if clusterID in presence_absence_associations:
            cluster_json_line.extend(
                geneCluster_associations(
                    presence_absence_associations[clusterID], suffix='PA'))

        #write file
        cluster_json_line = ','.join(cluster_json_line)
        geneClusterJSON_outfile.write('{' + cluster_json_line + '}')

    # close files
    geneClusterJSON_outfile.write(']')
    geneClusterJSON_outfile.close()
    if store_locus_tag: locus_tag_outfile.close()
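## One record of the resulting geneCluster.json looks like this (values
## invented for illustration; geneId/geneLen/count are written as numbers,
## everything else as quoted strings, matching the field list above):
## {"geneId":1,"geneLen":450,"count":30,"dupli":"no","dup_detail":"",
##  "ann":"hypothetical protein","msa":"GC_00001","divers":"0.012",
##  "event":"2","allAnn":"hypothetical protein","GName":"-",
##  "allGName":"-","locus":"NC_018495_CM9_RS01675"}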
def output_cutted_clusters(file_path, uncluster_filename, gene_list, cut_branch_threshold, treefile_used=None, cut_leftover=None):
    """
    delete the unclustered file and create new clusters
    params:
        gene_list: lists containing the genes in the new split clusters
        cut_leftover: flag indicating whether these are leftover genes
            from cutting long branches; leftover clusters are re-processed
            recursively, finished sub-clusters are recorded for alignment.
            Default: None.
    """
    clusterID = uncluster_filename.replace('.fna','')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna','faa')

    new_fa_files=set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path+origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index,split_gene_list in enumerate(gene_list,1):
        if cut_leftover==True:
            ## newClusterId for the leftover genes ('_r' suffix)
            newClusterId="%s_r%s"%(clusterID,sgs_index)
        else:
            newClusterId="%s_%s"%(clusterID,sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filename="%s%s"%(newClusterId,'.fna')
        gene_cluster_nu_filepath= file_path+gene_cluster_nu_filename
        gene_cluster_nu_write=open(gene_cluster_nu_filepath , 'wb')

        gene_cluster_aa_filename="%s%s"%(newClusterId,'.faa')
        gene_cluster_aa_filepath= file_path+gene_cluster_aa_filename
        gene_cluster_aa_write=open(gene_cluster_aa_filepath, 'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb: # Replace '\' in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb=gene_memb.replace("\\'","'")

            write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()
        #=============================================

        if cut_leftover==True:
            ## align the leftover genes, build a tree, and cut long branches until nothing more can be cut
            cutTree_outputCluster([gene_cluster_nu_filepath],file_path, cut_branch_threshold, treefile_used)
        else:
            ## recording the miscluster for deletion is already handled in cutTree_outputCluster;
            ## doing it again here would record the same cluster several times
            #with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
            #    delete_cluster_file.write('%s\n'%uncluster_filename)

            ## add record to new_clusters_longSplit.txt, which is used to align the new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt=defaultdict(list)
            addin_geneCluster_dt[ newClusterId ] = [0,[],0]
            ## num_strains
            addin_geneCluster_dt[ newClusterId ][0]=len(set( ig.split('|')[0] for ig in split_gene_list ))
            ## num_genes
            addin_geneCluster_dt[ newClusterId ][2]=len(set(split_gene_list))
            ## gene members
            addin_geneCluster_dt[ newClusterId ][1]=[ ig.split('-')[0] for ig in split_gene_list ]
            ## cPickle new cluster statistics
            write_pickle(''.join([file_path,'update_long_branch_splits/', newClusterId,'.cpk']),addin_geneCluster_dt)

    ## record the new cluster files so they are aligned in a later step
    with open(file_path+'new_clusters_longSplit.txt', 'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n'%i)
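## hedged sketch of the on-disk naming convention used above (the clusterID
## 'GC_00042' is a made-up example):
##   GC_00042_1.fna / GC_00042_1.faa   -> finished sub-cluster, recorded in
##                                        new_clusters_longSplit.txt for alignment
##   GC_00042_r1.fna / GC_00042_r1.faa -> leftover genes, fed back into
##                                        cutTree_outputCluster recursively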
def extract_sequences(path, strain_list, folders_dict, gbk_present, enable_RNA_clustering):
    '''
        go through all GenBank files and extract sequences and metadata for each one
    '''
    gbk_path= folders_dict['gbk_path']
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']
    RNA_path= folders_dict['RNA_path']

    geneID_to_geneSeqID_file= '%sgeneID_to_geneSeqID.cpk'%path
    geneID_to_description_file= '%sgeneID_to_description.cpk'%path
    RNAID_to_SeqID_file= '%sRNAID_to_SeqID.cpk'%path
    RNAID_to_description_file= '%sRNAID_to_description.cpk'%path

    protein_dict_path= '%s%s'%(protein_path,'all_protein_seq.cpk')
    nucleotide_dict_path= '%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk')
    RNA_dict_path= '%s%s'%(RNA_path,'all_RNA_seq.cpk')

    geneID_to_geneSeqID_dict= defaultdict()
    geneID_to_description_dict= defaultdict()
    RNAID_to_SeqID_dict= defaultdict()
    RNAID_to_description_dict= defaultdict()
    gene_aa_dict= defaultdict(dict)
    gene_na_dict= defaultdict(dict)
    RNA_dict= defaultdict(dict)

    if gbk_present:
        ## clean up folders when data from a previous run exist
        os.system('rm -rf '+protein_path+'*.faa')
        os.system('rm -rf '+nucleotide_path+'*.fna')
        missing_CDS_list=[] ## a list containing strains which have no CDS (if any)
        ## process gbk file
        for strainID in strain_list:
            gbk_fname= ''.join([gbk_path,strainID,'.gbk'])
            protein_fname= ''.join([protein_path,strainID,'.faa'])
            nucleotide_fname= ''.join([nucleotide_path,strainID,'.fna'])
            RNA_fname= ''.join([RNA_path,strainID,'.fna'])
            check_CDS_passed= gbk_translation(strainID, gbk_fname, protein_fname, nucleotide_fname, RNA_fname,
                geneID_to_geneSeqID_dict,geneID_to_description_dict,
                RNAID_to_SeqID_dict, RNAID_to_description_dict,
                gene_aa_dict, gene_na_dict, RNA_dict, enable_RNA_clustering)
            if not check_CDS_passed:
                missing_CDS_list.append(strainID)
        if len(missing_CDS_list)!=0:
            print 'Warning: no CDS found in the following genome(s), please double-check\n', missing_CDS_list
            exit()
    else:
        ## process fna/faa files if gbk files are not given.
        for strainID in strain_list:
            ## amino acid sequences
            protein_fname=''.join([protein_path,strainID,'.faa'])
            nucleotide_fname=''.join([nucleotide_path,strainID,'.fna'])
            aa_sequence_dt=read_fasta(protein_fname)
            na_sequence_dt=read_fasta(nucleotide_fname)
            ## prepare geneSeqID and description
            for geneID in aa_sequence_dt.keys():
                geneName, annotation= '',''
                geneID_to_geneSeqID_dict[geneID]=geneID
                geneID_to_description_dict[geneID]={'geneName': geneName,
                                                    'annotation': annotation}
                gene_aa_dict[strainID][geneID]=aa_sequence_dt[geneID]
                gene_na_dict[strainID][geneID]=na_sequence_dt[geneID]
    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    write_pickle(protein_dict_path,gene_aa_dict)
    write_pickle(nucleotide_dict_path,gene_na_dict)
    ## option: process RNA sequences for RNA_clustering
    if enable_RNA_clustering:
        write_pickle(RNA_dict_path,RNA_dict)
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)
    return gene_aa_dict, gene_na_dict
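## hedged sketch of the data layout produced by extract_sequences (strain and
## gene IDs are made up; the exact geneID format comes from gbk_translation
## or from the input fasta headers):
# gene_aa_dict= load_pickle(protein_dict_path)
# gene_aa_dict['NC_018495']['NC_018495|CM9_RS01675'] -> 'MKT...' (amino acids)
# gene_na_dict['NC_018495']['NC_018495|CM9_RS01675'] -> 'ATG...' (nucleotides)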
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    """
    totalStrain= len(strain_list)

    ## load clusters
    clustering_path= folders_dict['clustering_path']
    geneCluster_dt= load_pickle(clustering_path+'allclusters.cpk')
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']
    protein_dict_path= '%s%s'%(protein_path,'all_protein_seq.cpk')
    nucleotide_dict_path= '%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk')
    tmp_core_seq_path= '%s%s'%(clustering_path,'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict= load_pickle(path+'geneID_to_geneSeqID.cpk')

    ## create core gene list
    core_geneCluster_dt= defaultdict()
    # geneCluster_dt: {clusterID: [count_strains, [memb1,...], count_genes]}
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        if core_cutoff==1.0:
            strain_core_cutoff=totalStrain
        else:
            strain_core_cutoff=int(totalStrain*core_cutoff)
        ## check whether #genes == #strains and it's a core/soft-core gene
        if cluster_stats[0]==cluster_stats[2] and cluster_stats[0]>=strain_core_cutoff:
            core_geneCluster_dt[clusterID]=cluster_stats
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ',tmp_core_seq_path]))
    os.system('mkdir %s'%tmp_core_seq_path)

    ## disabled block: rebuild the sequence pickles from the per-strain
    ## fasta files instead of loading the existing ones below
    if 0:
        ## create dict storing all genes' translation
        gene_aa_dict= defaultdict(dict)
        for accession_id in strain_list:
            gene_aa_dict[accession_id]= read_fasta(''.join([protein_path,accession_id,'.faa']))
        write_pickle(protein_dict_path, gene_aa_dict)

        ## create dict for all gene's nucleotide sequence
        gene_na_dict= defaultdict(dict)
        for accession_id in strain_list:
            gene_na_dict[accession_id]=read_fasta(''.join([nucleotide_path,accession_id,'.fna']))
        write_pickle(nucleotide_dict_path, gene_na_dict)

    gene_aa_dict= load_pickle(protein_dict_path)
    gene_na_dict= load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt, geneID_to_geneSeqID_dict,
        gene_na_dict, gene_aa_dict)

    tmp_fa_files=glob.glob(tmp_core_seq_path+"*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity=tmp_average_core_diversity(tmp_core_seq_path)
    ## dampened cutoff with a floor of 0.1, saturating below 1.0
    ## (see the illustrative sketch after this function)
    factor_diversity= factor_core_diversity*calculated_core_diversity
    refined_core_diversity= round((0.1+factor_diversity)/(1+factor_diversity),4)
    print('factor used: '+str(factor_core_diversity))
    print('average core genome diversity: '+str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: '+str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path= '%stmp_core'%path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ',new_clustering_path]))
    os.system('mv %s %s'%(tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
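## hedged numerical sketch of the refined cutoff computed above; the factor
## value 10 below is only an example, not a program default.
def _example_refined_cutoff(calculated_core_diversity, factor_core_diversity):
    """ sketch: dampened long-branch cutoff with a floor of 0.1 """
    factor_diversity= factor_core_diversity*calculated_core_diversity
    return round((0.1+factor_diversity)/(1+factor_diversity),4)
## _example_refined_cutoff(0.0, 10)  -> 0.1  (floor at zero diversity)
## _example_refined_cutoff(0.02, 10) -> 0.25
## the cutoff grows with the observed core diversity but saturates below 1.0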
def geneCluster_to_json(path, enable_RNA_clustering, store_locus_tag,
                        raw_locus_tag, optional_table_column):
    """
    create json file for gene cluster table visualization
    input:  path to genecluster output
    output: geneCluster.json
    """
    # define path and make output directory
    geneCluster_path='%s%s'%(path,'geneCluster/')
    output_path='%s%s'%(path,'vis/')

    # open files
    geneClusterJSON_outfile=open(output_path+'geneCluster.json', 'wb')
    ## store locus_tags in a separate file for large datasets
    if store_locus_tag:
        locus_tag_outfile=open(path+'search_locus_tag.tsv', 'wb')


    ### load precomputed annotations, diversity, associations etc
    # load geneID_to_descriptions
    geneID_to_descriptions=load_pickle(path+'geneID_to_description.cpk')

    if enable_RNA_clustering:
        # load RNAID_to_description_file
        geneID_to_descriptions.update(load_pickle(path+'RNAID_to_description.cpk'))

    gene_diversity_Dt = load_pickle(geneCluster_path+'gene_diversity.cpk')
    ## load gain/loss event count dictionary
    dt_geneEvents = load_pickle(geneCluster_path+'dt_geneEvents.cpk')
    ## load association
    branch_associations_path = path+'branch_association.cpk'
    if os.path.isfile(branch_associations_path):
        branch_associations = load_pickle(branch_associations_path)
    else:
        branch_associations={}
    presence_absence_associations_path = path+'presence_absence_association.cpk'
    if os.path.isfile(presence_absence_associations_path):
        presence_absence_associations = load_pickle(presence_absence_associations_path)
    else:
        presence_absence_associations={}

    ## load list of clusters sorted by strain count
    sorted_genelist = load_sorted_clusters(path)

    geneClusterJSON_outfile.write('[')
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for gid, (clusterID, gene) in enumerate(sorted_genelist):
        strain_count, gene_list, gene_count = gene
        if gid!=0: ## separator before every record except the first
            geneClusterJSON_outfile.write(',\n')

        ## annotation majority
        allAnn, majority_annotation = consolidate_annotation(path, gene_list, geneID_to_descriptions)

        ## geneName majority
        all_geneName, majority_geneName =  consolidate_geneName(path, gene_list, geneID_to_descriptions)

        ## extract gain/loss event count
        gene_event= dt_geneEvents[gid]

        ## average length
        seqs = read_fasta(geneCluster_path+'%s%s'%(clusterID,'.fna')).values()
        geneClusterLength = int(np.mean([ len(igene) for igene in seqs]))

        ## msa
        #geneCluster_aln='%s%s'%(clusterID,'_aa.aln')
        geneCluster_aln=clusterID

        ## check for duplicates
        if gene_count>strain_count:
            duplicated_state='yes'
            dup_list=[ ig.split('|')[0] for ig in gene_list]
            # "#" to delimit (gene/gene_count)key/value ; "@" to seperate genes
            # Counter({'g1': 2, 'g2': 1})
            dup_detail=''.join(['%s#%s@'%(kd,vd) for kd, vd in Counter(dup_list).iteritems() if vd>1 ])[:-1]
        else:
            duplicated_state='no'
            dup_detail=''

        ## locus_tag
        if raw_locus_tag: # join the raw locus tags (field [1] of each geneSeqID)
            all_locus_tags=' '.join([ igl.split('|')[1] for igl in gene_list ])
        else: # in addition to locus tag, keep strain name (but replace '|')
            all_locus_tags=' '.join([ igl.replace('|','_') for igl in gene_list ])

        ## optionally store locus tags to file, remove from geneClusterJSON
        if store_locus_tag:
            locus_tag_outfile.write('%s\t%s\n'%(clusterID,all_locus_tags))
            all_locus_tags=''

        ## default cluster json fields
        cluster_json_line=['"geneId":'+str(gid+1),
                            '"geneLen":'+str(geneClusterLength),
                            '"count":'+str(strain_count),
                            '"dupli":"'+duplicated_state+'"',
                            '"dup_detail":"'+dup_detail+'"',
                            '"ann":"'+majority_annotation+'"',
                            '"msa":"'+geneCluster_aln+'"',
                            '"divers":"'+gene_diversity_Dt[clusterID]+'"',
                            '"event":"'+str(gene_event)+'"',
                            '"allAnn":"'+allAnn+'"',
                            '"GName":"'+majority_geneName+'"',
                            '"allGName":"'+all_geneName+'"',
                            '"locus":"'+all_locus_tags+'"'
                            ]

        if optional_table_column:
            cluster_json_line.extend(optional_geneCluster_properties(gene_list,optional_table_column))
        if clusterID in branch_associations:
            cluster_json_line.extend(geneCluster_associations(branch_associations[clusterID], suffix='BA'))
        if clusterID in presence_absence_associations:
            cluster_json_line.extend(geneCluster_associations(presence_absence_associations[clusterID], suffix='PA'))

        #write file
        cluster_json_line=','.join(cluster_json_line)
        geneClusterJSON_outfile.write('{'+cluster_json_line+'}')

    # close files
    geneClusterJSON_outfile.write(']')
    geneClusterJSON_outfile.close()
    if store_locus_tag: locus_tag_outfile.close()
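## hedged sketch of a single record in geneCluster.json (all values made up):
## {"geneId":1,"geneLen":945,"count":30,"dupli":"yes","dup_detail":"S1#2",
##  "ann":"hypothetical protein","msa":"GC_00001","divers":"0.012","event":"3",
##  "allAnn":"hypothetical protein","GName":"","allGName":"",
##  "locus":"S1_tag1 S1_tag2 S2_tag3"}
## 'dup_detail' lists strains carrying more than one copy as strain#count
## pairs joined by '@', e.g. 'S1#2@S2#3'.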