def cluster_align_makeTree( path, folders_dict, parallel, disable_cluster_postprocessing, simple_tree):
    """
    create gene clusters as nucleotide/ amino_acid fasta files
    and build individual gene trees based on fna files
    params:
        path: data folder prefix; sub-paths are appended by plain string
            concatenation (e.g. path+'geneCluster/')
        folders_dict: handed to create_geneCluster_fa unchanged
        parallel: handed to multips unchanged
        disable_cluster_postprocessing: when true, update the diversity cpk and
            copy allclusters.tsv/.cpk to the final/postprocessed file names
        simple_tree: handed to multips (for align_and_makeTree) unchanged
    raises:
        OSError/IOError if removing gene_diversity.txt or copying the
        cluster files fails
    """
    import shutil  # local import: only needed for the file copies below

    ## create_geneCluster_fa runs in a child process; wait until it has finished
    proc = multiprocessing.Process(target=create_geneCluster_fa, args=(path, folders_dict))
    proc.start()
    proc.join()

    ## align, build_tree, make_geneTree_json
    cluster_seqs_path = path + 'geneCluster/'
    ## drop gene_diversity.txt if one is already present in the cluster folder
    diversity_file = cluster_seqs_path + 'gene_diversity.txt'
    if os.path.exists(diversity_file):
        os.remove(diversity_file)

    fna_file_list = glob.glob(cluster_seqs_path + "*.fna")
    multips(align_and_makeTree, parallel, fna_file_list,
        cluster_seqs_path, simple_tree)

    ## if cluster_postprocessing skipped, copy allclusters.tsv and allclusters.cpk
    ## to the file names of the final cluster files
    if disable_cluster_postprocessing:
        update_diversity_cpk(path)
        clustering_path = path + 'protein_faa/diamond_matches/'
        shutil.copy(clustering_path + 'allclusters.tsv',
                    clustering_path + 'allclusters_final.tsv')
        shutil.copy(clustering_path + 'allclusters.cpk',
                    clustering_path + 'allclusters_postprocessed.cpk')
def cluster_align_makeTree( path, folders_dict, parallel, disable_cluster_postprocessing, simple_tree):
    """
    create gene clusters as nucleotide/ amino_acid fasta files
    and build individual gene trees based on fna files
    params:
        path: data folder prefix; sub-paths are appended by plain string
            concatenation (e.g. path+'geneCluster/')
        folders_dict: handed to create_geneCluster_fa unchanged
        parallel: handed to multips unchanged
        disable_cluster_postprocessing: when true, update the diversity cpk and
            copy allclusters.tsv/.cpk to the final/postprocessed file names
        simple_tree: handed to multips (for align_and_makeTree) unchanged
    raises:
        OSError/IOError if removing gene_diversity.txt or copying the
        cluster files fails
    """
    import shutil  # local import: only needed for the file copies below

    ## create_geneCluster_fa runs in a child process; wait until it has finished
    proc = multiprocessing.Process(target=create_geneCluster_fa, args=(path, folders_dict))
    proc.start()
    proc.join()

    ## align, build_tree, make_geneTree_json
    cluster_seqs_path = path + 'geneCluster/'
    ## drop gene_diversity.txt if one is already present in the cluster folder
    diversity_file = cluster_seqs_path + 'gene_diversity.txt'
    if os.path.exists(diversity_file):
        os.remove(diversity_file)

    fna_file_list = glob.glob(cluster_seqs_path + "*.fna")
    multips(align_and_makeTree, parallel, fna_file_list,
        cluster_seqs_path, simple_tree)

    ## if cluster_postprocessing skipped, copy allclusters.tsv and allclusters.cpk
    ## to the file names of the final cluster files
    if disable_cluster_postprocessing:
        update_diversity_cpk(path)
        clustering_path = path + 'protein_faa/diamond_matches/'
        shutil.copy(clustering_path + 'allclusters.tsv',
                    clustering_path + 'allclusters_final.tsv')
        shutil.copy(clustering_path + 'allclusters.cpk',
                    clustering_path + 'allclusters_postprocessed.cpk')
def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """
    create RNA clusters as nucleotide fasta files
    and build individual RNA trees based on fna files
    params:
        path: data folder prefix; sub-paths are appended by plain string
            concatenation
        folders_dict: handed to create_RNACluster_fa unchanged
        parallel: handed to multips unchanged
        simple_tree: handed to multips (for
            single_RNACluster_align_and_makeTree) unchanged
    raises:
        IOError/OSError if allclusters_postprocessed.cpk cannot be backed up
    """
    import shutil  # local import: only needed for the backup copy below

    diamond_RNACluster_dt = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    fasta_path = path + 'geneCluster/'
    fa_files = glob.glob(fasta_path + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, fa_files,
            fasta_path, simple_tree)

    ## add RNA cluster in diamond_geneCluster_dt
    geneClusterPath = path + 'protein_faa/diamond_matches/'
    cluster_cpk = geneClusterPath + 'allclusters_postprocessed.cpk'
    ### keep a .bk copy of the gene cluster cpk before the RNA clusters are added
    shutil.copy(cluster_cpk, cluster_cpk + '.bk')
    ### load gene cluster
    diamond_geneCluster_dt = load_pickle(cluster_cpk)
    ### update gene cluster with RNA cluster
    update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt)
    ### update diversity file
    update_diversity_cpk(path)
def cut_all_trees_from_merged_clusters(parallel, path, cut_branch_threshold,
                                       simple_tree):
    """
    run cutTree_outputCluster on the GC_un*.fna cluster files, then align and
    build trees for the clusters listed in new_clusters_longSplit.txt
    params:
        parallel: handed to multips unchanged
        path: data folder prefix; 'geneCluster/' is appended to it
        cut_branch_threshold: handed to multips (for cutTree_outputCluster)
        simple_tree: handed to multips (for align_and_makeTree)
    """
    cluster_dir = '%sgeneCluster/' % (path,)

    ## parallelization of "post-clustering workflow for merged unclustered records"
    unclustered_fna_files = glob.glob(cluster_dir + 'GC_un*.fna')
    multips(cutTree_outputCluster, parallel, unclustered_fna_files,
            cluster_dir, cut_branch_threshold, treefile_used=False)

    ## gather new clusters from new_clusters_longSplit.txt (one entry per line)
    ## NOTE(review): there is no existence check, open() raises if the file
    ## is missing -- confirm that this is intended
    split_record_fname = cluster_dir + 'new_clusters_longSplit.txt'
    new_fa_files_list = []
    with open(split_record_fname, 'rb') as split_record:
        for record_line in split_record:
            new_fa_files_list.append(record_line.rstrip())

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, cluster_dir,
            simple_tree)
def cut_all_trees_from_merged_clusters(parallel, path, cut_branch_threshold, simple_tree):
    """
    run cutTree_outputCluster on the GC_un*.fna cluster files, then align and
    build trees for the clusters listed in new_clusters_longSplit.txt
    params:
        parallel: handed to multips unchanged
        path: data folder prefix; 'geneCluster/' is appended to it
        cut_branch_threshold: handed to multips (for cutTree_outputCluster)
        simple_tree: handed to multips (for align_and_makeTree)
    """
    cluster_dir = '%sgeneCluster/' % (path,)

    ## parallelization of "post-clustering workflow for merged unclustered records"
    unclustered_fna_files = glob.glob(cluster_dir + 'GC_un*.fna')
    multips(cutTree_outputCluster, parallel, unclustered_fna_files,
            cluster_dir, cut_branch_threshold, treefile_used=False)

    ## gather new clusters from new_clusters_longSplit.txt (one entry per line)
    ## NOTE(review): there is no existence check, open() raises if the file
    ## is missing -- confirm that this is intended
    split_record_fname = cluster_dir + 'new_clusters_longSplit.txt'
    new_fa_files_list = []
    with open(split_record_fname, 'rb') as split_record:
        for record_line in split_record:
            new_fa_files_list.append(record_line.rstrip())

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, cluster_dir,
            simple_tree)
def RNAclusters_align_makeTree( path, folders_dict, parallel, simple_tree ):
    """
    create RNA clusters as nucleotide fasta files
    and build individual RNA trees based on fna files
    params:
        path: data folder prefix; sub-paths are appended by plain string
            concatenation
        folders_dict: handed to create_RNACluster_fa unchanged
        parallel: handed to multips unchanged
        simple_tree: handed to multips (for
            single_RNACluster_align_and_makeTree) unchanged
    raises:
        IOError/OSError if allclusters_postprocessed.cpk cannot be backed up
    """
    import shutil  # local import: only needed for the backup copy below

    diamond_RNACluster_dt = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    fasta_path = path + 'geneCluster/'
    fa_files = glob.glob(fasta_path + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, fa_files,
            fasta_path, simple_tree)

    ## add RNA cluster in diamond_geneCluster_dt
    geneClusterPath = path + 'protein_faa/diamond_matches/'
    cluster_cpk = geneClusterPath + 'allclusters_postprocessed.cpk'
    ### keep a .bk copy of the gene cluster cpk before the RNA clusters are added
    shutil.copy(cluster_cpk, cluster_cpk + '.bk')
    ### load gene cluster
    diamond_geneCluster_dt = load_pickle(cluster_cpk)
    ### update gene cluster with RNA cluster
    update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt)
    ### update diversity file
    update_diversity_cpk(path)
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel, core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    params:
        path: data folder prefix (sub-paths are appended as plain strings)
        folders_dict: provides 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: its length is used as the total number of strains
        parallel: handed to multips unchanged
        core_cutoff: fraction of strains a cluster has to cover to be kept as
            core/soft-core gene; 1.0 requires all strains
        factor_core_diversity: factor applied to the average core diversity
            when deriving the refined cutoff
        species: handed to multips (for calculate_diversity) unchanged
    return:
        (calculated_core_diversity, refined_core_diversity)
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## minimal number of strains for a core/soft-core gene; it does not
    ## depend on the cluster, so it is computed once before the loop
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        count_strains, count_genes = cluster_stats[0], cluster_stats[2]
        ## check whether #genes == #strains and it's a core/soft-core gene
        if count_strains == count_genes and count_strains >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty tmp_core folder
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## the amino-acid and nucleotide sequence dicts are read from the existing
    ## cpk files (all_protein_seq.cpk / all_nucleotide_seq.cpk); they are not
    ## rebuilt here
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt, geneID_to_geneSeqID_dict,
        gene_na_dict, gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path, species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    ## refined cutoff = (0.1 + f*d)/(1 + f*d), rounded to 4 digits
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round((0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print('defined core genome diversity cutoff for splitting long branches: ' + str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity
def postprocess_paralogs(parallel,
                         path,
                         nstrains,
                         simple_tree,
                         geneCluster_dt,
                         new_fa_files_set,
                         paralog_branch_cutoff,
                         paralog_frac_cutoff=0.3,
                         plot=0):
    """
    splitting paralogs, discarding old gene clusters and creating new clusters of split paralogs
    params:
        parallel: number of threads to use
        path: data folder prefix; 'geneCluster/' is appended to it
        nstrains: total number of strains
        simple_tree: handed to multips (for align_and_makeTree) unchanged
        geneCluster_dt: handed to create_split_cluster_files unchanged
        new_fa_files_set: fasta files whose .nwk trees should be checked;
            if empty, all *nwk files in the geneCluster folder are checked
        paralog_branch_cutoff: branch length to split (E.g.: core gene diversity as cutoff)
        paralog_frac_cutoff:  fraction of nstrains required for splitting
        plot:      save figure with paralog statistics
    return:
        (number of clusters that were split,
         set of fasta files created for the split clusters)
    """

    ## exploring paralogs, default: False (not explore and plot), otherwise figure with statistics will saved
    if plot == 1:
        explore_paralogs(path,
                         nstrains,
                         paralog_branch_cutoff=paralog_branch_cutoff,
                         paralog_frac_cutoff=paralog_frac_cutoff,
                         plot=plot)

    clusters_fpath = path + 'geneCluster/'

    if len(new_fa_files_set) == 0:
        fname_list = glob.iglob(clusters_fpath + '*nwk')
    else:
        fname_list = [
            new_fa.replace('.fna', '.nwk') for new_fa in new_fa_files_set
            if os.path.exists(new_fa.replace('.fna', '.nwk'))
        ]

    ## collect the output in a fresh set; the caller's set is not modified
    split_fa_files = set()
    n_split_clusters = 0

    for fname in fname_list:
        try:
            tree = Phylo.read(fname, 'newick')
        except Exception:
            print('debug(postprocess_paralogs read nwk file):  %s   %s'
                  % (fname, os.getcwd()))
            ## skip unreadable tree files: otherwise `tree` would be undefined
            ## or still hold the tree of the previous file
            continue

        best_split = find_best_split(tree)
        if best_split is None:
            continue

        do_split = split_cluster(
            tree,
            nstrains,
            max_branch_length=paralog_branch_cutoff,
            max_paralogs=paralog_frac_cutoff * nstrains)
        if do_split:
            ## genes below the best split form one cluster, the rest the other
            all_genes = {n.name for n in tree.get_terminals()}
            gene_list1 = {n.name for n in best_split.get_terminals()}
            gene_list2 = all_genes.difference(gene_list1)

            new_fa_files = create_split_cluster_files(
                clusters_fpath, fname, gene_list1, gene_list2,
                geneCluster_dt)
            split_fa_files |= new_fa_files
            n_split_clusters += 1

    ## make new aln and tree
    multips(align_and_makeTree, parallel, list(split_fa_files),
            clusters_fpath, simple_tree)
    return n_split_clusters, split_fa_files
def postprocess_split_long_branch(parallel,
                                  path,
                                  simple_tree,
                                  cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.
    params:
        parallel: handed to multips unchanged
        path: data folder prefix; sub-paths are appended as plain strings
        simple_tree: handed to multips (for align_and_makeTree) unchanged
        cut_branch_threshold: handed to multips (for cutTree_outputCluster)
    """

    def recreate_folder(folder):
        ## remove the folder from previous run, then create it anew
        if os.path.exists(folder):
            os.system('rm -r ' + folder)
        os.system('mkdir ' + folder)

    file_path = path + 'geneCluster/'
    recreate_folder(file_path + 'update_long_branch_splits/')
    recreate_folder(file_path + 'deleted_clusters_longSplit/')

    ## load clusters
    cluster_path = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_fname_list = glob.glob(file_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    new_record = file_path + 'new_clusters_longSplit.txt'
    old_record = file_path + 'old_clusters_longSplit.txt'
    for stale_record in (new_record, old_record):
        if os.path.exists(stale_record):
            os.system('rm ' + stale_record)

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, True)

    if not os.path.exists(new_record):
        ## no clusters postprocessed: the input cpk is copied as postprocessed cpk
        os.system('cp %s %s' % (cluster_path + 'allclusters.cpk',
                                cluster_path + 'allclusters_postprocessed.cpk'))
        return

    ## new_clusters_longSplit.txt exists: gather the new clusters from it
    with open(new_record, 'rb') as new_clusters_longSplit:
        new_fa_files_list = [line.rstrip() for line in new_clusters_longSplit]
    print('#times of splitting long branches: %s'
          % (len(new_fa_files_list) - 1))
    with open(old_record, 'rb') as delete_cluster_file:
        deleted_file_count = sum(1 for _ in delete_cluster_file)
    print('#clusters split during the checking of long branches: %s'
          % deleted_file_count)

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, file_path,
            simple_tree)
    # =============================================

    ## delete original clusters which are split
    delete_original_clusters(file_path, geneCluster_dt)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## rename both record files
    os.system('mv  %s %s' % (new_record,
                             file_path + 'added_clusters_split_long.txt'))
    os.system('mv  %s %s' % (old_record,
                             file_path + 'deleted_clusters_split_long.txt'))
def postprocess_paralogs(parallel, path, nstrains, simple_tree, geneCluster_dt,
    new_fa_files_set,  paralog_branch_cutoff, paralog_frac_cutoff=0.3, plot=0):
    """
    splitting paralogs, discarding old gene clusters and creating new clusters of split paralogs
    params:
        parallel: number of threads to use
        path: data folder prefix; 'geneCluster/' is appended to it
        nstrains: total number of strains
        simple_tree: handed to multips (for align_and_makeTree) unchanged
        geneCluster_dt: handed to create_split_cluster_files unchanged
        new_fa_files_set: fasta files whose .nwk trees should be checked;
            if empty, all *nwk files in the geneCluster folder are checked
        paralog_branch_cutoff: branch length to split (E.g.: core gene diversity as cutoff)
        paralog_frac_cutoff:  fraction of nstrains required for splitting
        plot:      save figure with paralog statistics
    return:
        (number of clusters that were split,
         set of fasta files created for the split clusters)
    """

    ## exploring paralogs, default: False (not explore and plot), otherwise figure with statistics will saved
    if plot == 1:
        explore_paralogs(path, nstrains, paralog_branch_cutoff=paralog_branch_cutoff,
                         paralog_frac_cutoff=paralog_frac_cutoff, plot=plot)

    clusters_fpath = path + 'geneCluster/'

    if len(new_fa_files_set) == 0:
        fname_list = glob.iglob(clusters_fpath + '*nwk')
    else:
        fname_list = [
            new_fa.replace('.fna', '.nwk') for new_fa in new_fa_files_set
            if os.path.exists(new_fa.replace('.fna', '.nwk'))
        ]

    ## collect the output in a fresh set; the caller's set is not modified
    split_fa_files = set()
    n_split_clusters = 0

    for fname in fname_list:
        try:
            tree = Phylo.read(fname, 'newick')
        except Exception:
            print('debug(postprocess_paralogs read nwk file):  %s   %s'
                  % (fname, os.getcwd()))
            ## skip unreadable tree files: otherwise `tree` would be undefined
            ## or still hold the tree of the previous file
            continue

        best_split = find_best_split(tree)
        if best_split is None:
            continue

        do_split = split_cluster(tree, nstrains,
                                 max_branch_length=paralog_branch_cutoff,
                                 max_paralogs=paralog_frac_cutoff * nstrains)
        if do_split:
            ## genes below the best split form one cluster, the rest the other
            all_genes = {n.name for n in tree.get_terminals()}
            gene_list1 = {n.name for n in best_split.get_terminals()}
            gene_list2 = all_genes.difference(gene_list1)

            new_fa_files = create_split_cluster_files(
                clusters_fpath, fname, gene_list1, gene_list2, geneCluster_dt)
            split_fa_files |= new_fa_files
            n_split_clusters += 1

    ## make new aln and tree
    multips(align_and_makeTree, parallel, list(split_fa_files), clusters_fpath, simple_tree)
    return n_split_clusters, split_fa_files
def postprocess_split_long_branch(parallel, path, simple_tree, cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.
    params:
        parallel: handed to multips unchanged
        path: data folder prefix; sub-paths are appended as plain strings
        simple_tree: handed to multips (for align_and_makeTree) unchanged
        cut_branch_threshold: handed to multips (for cutTree_outputCluster)
    """

    def recreate_folder(folder):
        ## remove the folder from previous run, then create it anew
        if os.path.exists(folder):
            os.system('rm -r ' + folder)
        os.system('mkdir ' + folder)

    file_path = path + 'geneCluster/'
    recreate_folder(file_path + 'update_long_branch_splits/')
    recreate_folder(file_path + 'deleted_clusters_longSplit/')

    ## load clusters
    cluster_path = path + 'protein_faa/diamond_matches/'
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_fname_list = glob.glob(file_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    new_record = file_path + 'new_clusters_longSplit.txt'
    old_record = file_path + 'old_clusters_longSplit.txt'
    for stale_record in (new_record, old_record):
        if os.path.exists(stale_record):
            os.system('rm ' + stale_record)

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, True)

    if not os.path.exists(new_record):
        ## no clusters postprocessed: the input cpk is copied as postprocessed cpk
        os.system('cp %s %s' % (cluster_path + 'allclusters.cpk',
                                cluster_path + 'allclusters_postprocessed.cpk'))
        return

    ## new_clusters_longSplit.txt exists: gather the new clusters from it
    with open(new_record, 'rb') as new_clusters_longSplit:
        new_fa_files_list = [line.rstrip() for line in new_clusters_longSplit]
    print('#times of splitting long branches: %s'
          % (len(new_fa_files_list) - 1))
    with open(old_record, 'rb') as delete_cluster_file:
        deleted_file_count = sum(1 for _ in delete_cluster_file)
    print('#clusters split during the checking of long branches: %s'
          % deleted_file_count)

    ## parallelization of "align and make tree on new cluster"
    multips(align_and_makeTree, parallel, new_fa_files_list, file_path,
            simple_tree)
    # =============================================

    ## delete original clusters which are split
    delete_original_clusters(file_path, geneCluster_dt)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## rename both record files
    os.system('mv  %s %s' % (new_record,
                             file_path + 'added_clusters_split_long.txt'))
    os.system('mv  %s %s' % (old_record,
                             file_path + 'deleted_clusters_split_long.txt'))
def estimate_core_gene_diversity(path, folders_dict, strain_list, parallel,
                                 core_cutoff, factor_core_diversity, species):
    """
    estimate core gene diversity before gene cluster alignment
    and cluster post-processing
    params:
        path: data folder prefix (sub-paths are appended as plain strings)
        folders_dict: provides 'clustering_path', 'protein_path' and
            'nucleotide_path'
        strain_list: its length is used as the total number of strains
        parallel: handed to multips unchanged
        core_cutoff: fraction of strains a cluster has to cover to be kept as
            core/soft-core gene; 1.0 requires all strains
        factor_core_diversity: factor applied to the average core diversity
            when deriving the refined cutoff
        species: handed to multips (for calculate_diversity) unchanged
    return:
        (calculated_core_diversity, refined_core_diversity)
    """
    totalStrain = len(strain_list)

    ## load clusters
    clustering_path = folders_dict['clustering_path']
    geneCluster_dt = load_pickle(clustering_path + 'allclusters.cpk')
    protein_path = folders_dict['protein_path']
    nucleotide_path = folders_dict['nucleotide_path']
    protein_dict_path = '%s%s' % (protein_path, 'all_protein_seq.cpk')
    nucleotide_dict_path = '%s%s' % (nucleotide_path, 'all_nucleotide_seq.cpk')
    tmp_core_seq_path = '%s%s' % (clustering_path, 'tmp_core/')
    ## load geneID_to_geneSeqID geneSeqID cpk file
    geneID_to_geneSeqID_dict = load_pickle(path + 'geneID_to_geneSeqID.cpk')

    ## minimal number of strains for a core/soft-core gene; it does not
    ## depend on the cluster, so it is computed once before the loop
    if core_cutoff == 1.0:
        strain_core_cutoff = totalStrain
    else:
        strain_core_cutoff = int(totalStrain * core_cutoff)

    ## create core gene list
    core_geneCluster_dt = defaultdict()
    # geneCluster_dt: {clusterID:[ count_strains,[memb1,...],count_genes }
    for clusterID, cluster_stats in geneCluster_dt.iteritems():
        count_strains, count_genes = cluster_stats[0], cluster_stats[2]
        ## check whether #genes == #strains and it's a core/soft-core gene
        if count_strains == count_genes and count_strains >= strain_core_cutoff:
            core_geneCluster_dt[clusterID] = cluster_stats

    ## start from an empty tmp_core folder
    if os.path.exists(tmp_core_seq_path):
        os.system(''.join(['rm -rf ', tmp_core_seq_path]))
    os.system('mkdir %s' % tmp_core_seq_path)

    ## the amino-acid and nucleotide sequence dicts are read from the existing
    ## cpk files (all_protein_seq.cpk / all_nucleotide_seq.cpk); they are not
    ## rebuilt here
    gene_aa_dict = load_pickle(protein_dict_path)
    gene_na_dict = load_pickle(nucleotide_dict_path)

    ## write nucleotide and amino-acid sequences for each gene cluster
    export_cluster_seq_tmp(tmp_core_seq_path, core_geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict)

    tmp_fa_files = glob.glob(tmp_core_seq_path + "*.fna")
    multips(calculate_diversity, parallel, tmp_fa_files, tmp_core_seq_path,
            species)

    calculated_core_diversity = tmp_average_core_diversity(tmp_core_seq_path)
    ## refined cutoff = (0.1 + f*d)/(1 + f*d), rounded to 4 digits
    scaled_diversity = factor_core_diversity * calculated_core_diversity
    refined_core_diversity = round(
        (0.1 + scaled_diversity) / (1 + scaled_diversity), 4)
    print('factor used: ' + str(factor_core_diversity))
    print('average core genome diversity: ' + str(calculated_core_diversity))
    print(
        'defined core genome diversity cutoff for splitting long branches: ' +
        str(refined_core_diversity))

    ## move folder tmp_core to the central data folder
    new_clustering_path = '%stmp_core' % path
    if os.path.exists(new_clustering_path):
        os.system(''.join(['rm -r ', new_clustering_path]))
    os.system('mv %s %s' % (tmp_core_seq_path, path))
    return calculated_core_diversity, refined_core_diversity