Beispiel #1
0
def aln_to_Newick(path, folders_dict, raxml_timelimit, raxml_path, threads):
    """Build the core gene SNP tree from the SNP alignment.

    input:  cluster_seq_path + 'SNP_whole_matrix.aln'
    output: cluster_seq_path + 'strain_tree.nwk'

    Steps (all intermediate files are written to a temporary working
    directory that is removed after a successful run):
      1. FastTree builds an initial tree, whose polytomies are resolved.
      2. If ``raxml_timelimit`` > 0, RAxML searches for a better topology
         until it produces a result, exits, or the time limit expires.
         The final topology is used if present, otherwise the most recently
         written checkpoint, otherwise the FastTree tree.
      3. RAxML optimizes the branch lengths on that topology.
      4. The tree is midpoint-rooted and copied next to the alignment.

    Parameters:
        path: not used by this function; kept for interface compatibility.
        folders_dict: mapping with the keys 'cluster_seq_path' and
            'log_path'; both values are used as plain string prefixes
            (presumably ending with a path separator -- verify at caller).
        raxml_timelimit: time budget in minutes for the RAxML topology
            search; a value <= 0 skips the search.
        raxml_path: not used by this function; kept for interface
            compatibility.
        threads: value handed to RAxML's ``-T`` option.
    """
    cluster_seq_path = folders_dict['cluster_seq_path']
    log_path = folders_dict['log_path']
    SNP_matrix_path = cluster_seq_path + 'SNP_whole_matrix.aln'
    # timestamp plus random suffix so that parallel runs do not share a folder
    output_path = '_'.join([cluster_seq_path + 'temp_coretree',
                            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                            str(random.randint(0, 1000000))])
    os.mkdir(output_path)
    cwd = os.getcwd()
    os.chdir(output_path)
    try:
        ## run fasttree
        start = time.time()
        fasttree_program = 'fasttree' if check_dependency('fasttree') else 'FastTree'
        os.system(fasttree_program + ' -gtr -nt -gamma -nosupport -mlacc 2 -slownni '
                  + SNP_matrix_path + ' > initial_tree.newick0 2> ' + log_path + 'fasttree.log')
        print('%s %s' % (' fasttree time-cost:', times(start)))

        resolve_polytomies('initial_tree.newick0', 'initial_tree.newick')

        ## run raxml
        start = time.time()
        out_fname = "tree_infer.newick"
        # resolved before the branch below: the branch-length step needs the
        # program name even when the topology search is skipped
        raxml_program = 'raxml' if check_dependency('raxml') else 'raxmlHPC'
        # fallback topology when RAxML is skipped or leaves nothing behind
        topology_file = 'initial_tree.newick'
        if raxml_timelimit > 0:
            print('%s%d%s' % ('RAxML tree optimization within the timelimit of ', raxml_timelimit, ' minutes'))
            end_time = time.time() + int(raxml_timelimit*60)

            # 'exec' replaces the shell by RAxML so terminate() hits RAxML itself
            process = subprocess.Popen('exec ' + raxml_program + ' -f d -T ' + str(threads) + ' -j -s '
                                       + SNP_matrix_path
                                       + ' -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick > '
                                       + log_path + 'raxml.log', shell=True)
            while time.time() < end_time:
                # stop polling once the result exists or RAxML is gone
                if os.path.isfile('RAxML_result.topology') or process.poll() is not None:
                    break
                time.sleep(10)
            if process.poll() is None:
                process.terminate()
            # reap the child before RAxML is started again below
            process.wait()

            if os.path.isfile('RAxML_result.topology'):
                topology_file = 'RAxML_result.topology'
            else:
                checkpoint_files = glob.glob('RAxML_checkpoint*')
                if checkpoint_files:
                    # glob order is arbitrary: pick the most recently written one
                    topology_file = max(checkpoint_files, key=os.path.getmtime)
        shutil.copy(topology_file, 'raxml_tree.newick')

        print('RAxML branch length optimization and rooting')
        os.system(raxml_program + ' -f e -T ' + str(threads) + ' -s ' + SNP_matrix_path
                  + ' -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick > '
                  + log_path + 'raxml.log')
        shutil.copy('RAxML_result.branches', out_fname)

        print('%s %s' % (' raxml time-cost:', times(start)))
        midpointRooting(out_fname, 'strain_tree.nwk')
        shutil.copy('strain_tree.nwk', cluster_seq_path + 'strain_tree.nwk')
    finally:
        # never leave the caller stranded inside the temporary folder
        os.chdir(cwd)
    shutil.rmtree(output_path)
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    """Align every RNA cluster, build its tree and log its nucleotide diversity.

    For each FASTA file in ``fa_files_list`` one line
    ``clusterID<TAB>diversity`` is appended to
    ``alignFile_path + 'gene_diversity.txt'``:
      * a cluster holding a single sequence is only re-written as
        ``*_na.aln`` and logged with diversity '0.0';
      * any other cluster is aligned and gets a tree through ``mpm_tree``;
        when ``simple_tree`` is False the tree additionally goes through
        ``ancestral`` and ``refine``.

    A failure on one cluster is reported on stdout and the loop continues
    with the next file.

    Parameters:
        fa_files_list: iterable of paths to RNA cluster FASTA files.
        alignFile_path: string prefix (joined by concatenation) for the
            exported files and gene_diversity.txt.
        simple_tree: when it compares equal to False, the tree is built with
            treetime_used=True and the ancestral/refine steps are run.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            # context manager: the handle is closed (and flushed) per cluster
            with open(diversity_filename, 'a') as geneDiversity_file:
                cluster_seqs = read_fasta(RNA_cluster_nu_filename)
                if len(cluster_seqs) == 1:  # nothing to do for singletons
                    ## na.aln
                    RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace('.fna', '_na.aln')
                    ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in cluster_seqs.iteritems():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
                else:  # align and build tree
                    print(RNA_cluster_nu_filename)
                    myTree = mpm_tree(RNA_cluster_nu_filename)
                    myTree.align()

                    if simple_tree == False:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                        myTree.ancestral(translate_tree=True)
                        myTree.refine(CDS=False)
                    else:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                    myTree.diversity_statistics_nuc()
                    myTree.export(path=alignFile_path, RNA_specific=True)

                    RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, RNA_diversity_values))
                    print('%s %s' % (clusterID, RNA_diversity_values))
        except Exception:
            # best effort per cluster; KeyboardInterrupt/SystemExit still propagate
            print("Aligning and tree building of RNA %s failed"%RNA_cluster_nu_filename)
def cutTree_outputCluster( file_list, file_path, cut_branch_threshold, treefile_used):
    """
    process flow for parallelization to cut the tree and output the clades in new clusters

    For every entry of file_list a tree is obtained (read from a newick file
    when treefile_used is True, otherwise built via quick_align_makeTree),
    cut with cut_tree_gather_clades, and the resulting clades are written
    with output_cutted_clusters. Clusters that were split and whose file
    name does not contain '_r' are appended to
    file_path + 'old_clusters_longSplit.txt'.

    A tree file that cannot be read is reported and skipped.

    Parameters:
        file_list: iterable of input paths ('.nwk' tree files when
            treefile_used is True, cluster sequence files otherwise).
        file_path: string prefix for the output files.
        cut_branch_threshold: passed through to cut_tree_gather_clades and
            output_cutted_clusters.
        treefile_used: True to read existing trees instead of building them.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for input_filepath in file_list:
        if treefile_used == True:
            ## read tree
            input_cluster_filename = input_filepath.split('/')[-1].replace('.nwk', '.fna')
            try:
                tree = Phylo.read(input_filepath, 'newick')
            except Exception:
                print('%s %s' % ('reading tree failed: ', input_filepath))
                # no tree for this cluster: going on would either raise a
                # NameError or silently re-cut the previous iteration's tree
                continue
        else:
            ## make tree
            input_cluster_filename = input_filepath.split('/')[-1]
            tree = quick_align_makeTree(input_filepath, fasttree_name)

        ## attempt to cut the tree
        gene_list, rest_genes = cut_tree_gather_clades(tree, cut_branch_threshold)

        if len(gene_list) != 0:
            ## add to-be-deleted cluster records
            if '_r' not in input_cluster_filename:
                ## 1st check: original cluster has been split
                ## 2nd check: it's not a "further-split" cluster
                ##            from an already split cluster
                with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
                    delete_cluster_file.write('%s\n'%input_cluster_filename)
            ## further process on left-over genes
            if len(rest_genes) != 0:
                output_cutted_clusters(file_path, input_cluster_filename,
                                    rest_genes, cut_branch_threshold,
                                    treefile_used=False, cut_leftover=True)
        elif '_r' in input_cluster_filename:
            ## nothing can be cut any further: these are rest genes which
            ## cannot be split, so they are written out as they are
            gene_list = rest_genes
            rest_genes = []
        ## (an unsplit original cluster falls through with an empty gene_list)

        ## write clades in gene_list into clusters
        output_cutted_clusters(file_path, input_cluster_filename,
                            gene_list, cut_branch_threshold,
                            treefile_used=False, cut_leftover=False)
def align_and_makeTree( fna_file_list, alignFile_path, simple_tree):
    """Align every gene cluster, build its tree and log its nucleotide diversity.

    For each nucleotide FASTA file in ``fna_file_list`` one line
    ``clusterID<TAB>diversity`` is appended to
    ``alignFile_path + 'gene_diversity.txt'``:
      * a cluster holding a single sequence is only re-written as
        ``*_na_aln.fa`` / ``*_aa_aln.fa`` (the latter from the matching
        '.faa' file), each duplicated as ``*_aln_reduced.fa``, and logged
        with diversity '0.0';
      * any other cluster is codon-aligned, translated and gets a tree
        through ``mpm_tree``; when ``simple_tree`` is False the tree
        additionally goes through ``ancestral`` and ``refine``.

    A failure on one cluster is reported on stdout and the loop continues
    with the next file.

    Parameters:
        fna_file_list: iterable of paths to gene cluster '.fna' files.
        alignFile_path: string prefix (joined by concatenation) for the
            exported files and gene_diversity.txt.
        simple_tree: when it compares equal to False, the tree is built with
            treetime_used=True and the ancestral/refine steps are run.
    """
    import shutil  # function-scope import keeps this block self-contained

    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            # context manager: the handle is closed (and flushed) per cluster
            with open(diversity_filename, 'a') as geneDiversity_file:
                nu_seqs = read_fasta(gene_cluster_nu_filename)
                if len(nu_seqs) == 1:  # nothing to do for singletons
                    ## na_aln.fa
                    gene_cluster_nu_aln_filename = gene_cluster_nu_filename.replace('.fna', '_na_aln.fa')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_nu_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in nu_seqs.iteritems():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                    shutil.copy(gene_cluster_nu_aln_filename,
                                gene_cluster_nu_aln_filename.replace('_aln', '_aln_reduced'))

                    ## aa_aln.fa
                    gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                    gene_cluster_aa_aln_filename = gene_cluster_nu_filename.replace('.fna', '_aa_aln.fa')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_aa_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                    shutil.copy(gene_cluster_aa_aln_filename,
                                gene_cluster_aa_aln_filename.replace('_aln', '_aln_reduced'))

                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
                else:  # align and build tree
                    myTree = mpm_tree(gene_cluster_nu_filename)
                    myTree.codon_align()
                    myTree.translate()
                    if simple_tree == False:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                        myTree.ancestral(translate_tree=True)
                        myTree.refine()
                    else:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                    myTree.diversity_statistics_nuc()
                    myTree.export(path=alignFile_path)
                    diversity_nuc = round(myTree.diversity_nuc, 3)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, diversity_nuc))
        except Exception:
            # best effort per cluster; KeyboardInterrupt/SystemExit still propagate
            print("Aligning and tree building of %s failed"%gene_cluster_nu_filename)
def align_and_makeTree( fna_file_list, alignFile_path, simple_tree):
    """Align every gene cluster, build its tree and log its nucleotide diversity.

    For each nucleotide FASTA file in ``fna_file_list`` one line
    ``clusterID<TAB>diversity`` is appended to
    ``alignFile_path + 'gene_diversity.txt'``:
      * a cluster holding a single sequence is only re-written as
        ``*_na_aln.fa`` / ``*_aa_aln.fa`` (the latter from the matching
        '.faa' file), each duplicated as ``*_aln_reduced.fa``, and logged
        with diversity '0.0';
      * any other cluster is codon-aligned, translated and gets a tree
        through ``mpm_tree``; when ``simple_tree`` is False the tree
        additionally goes through ``ancestral`` and ``refine``.

    A failure on one cluster is reported on stdout and the loop continues
    with the next file.

    Parameters:
        fna_file_list: iterable of paths to gene cluster '.fna' files.
        alignFile_path: string prefix (joined by concatenation) for the
            exported files and gene_diversity.txt.
        simple_tree: when it compares equal to False, the tree is built with
            treetime_used=True and the ancestral/refine steps are run.
    """
    import shutil  # function-scope import keeps this block self-contained

    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            # context manager: the handle is closed (and flushed) per cluster
            with open(diversity_filename, 'a') as geneDiversity_file:
                nu_seqs = read_fasta(gene_cluster_nu_filename)
                if len(nu_seqs) == 1:  # nothing to do for singletons
                    ## na_aln.fa
                    gene_cluster_nu_aln_filename = gene_cluster_nu_filename.replace('.fna', '_na_aln.fa')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_nu_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in nu_seqs.iteritems():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                    shutil.copy(gene_cluster_nu_aln_filename,
                                gene_cluster_nu_aln_filename.replace('_aln', '_aln_reduced'))

                    ## aa_aln.fa
                    gene_cluster_aa_filename = gene_cluster_nu_filename.replace('.fna', '.faa')
                    gene_cluster_aa_aln_filename = gene_cluster_nu_filename.replace('.fna', '_aa_aln.fa')
                    ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                    with open(gene_cluster_aa_aln_filename, 'wb') as write_file:
                        for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                            write_in_fa(write_file, SeqID.replace('|', '-'), Sequence)
                    shutil.copy(gene_cluster_aa_aln_filename,
                                gene_cluster_aa_aln_filename.replace('_aln', '_aln_reduced'))

                    geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
                else:  # align and build tree
                    myTree = mpm_tree(gene_cluster_nu_filename)
                    myTree.codon_align()
                    myTree.translate()
                    if simple_tree == False:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=True)
                        myTree.ancestral(translate_tree=True)
                        myTree.refine()
                    else:
                        myTree.build(raxml=False, fasttree_program=fasttree_name, treetime_used=False)
                    myTree.diversity_statistics_nuc()
                    myTree.export(path=alignFile_path)
                    diversity_nuc = round(myTree.diversity_nuc, 3)
                    geneDiversity_file.write('%s\t%s\n' % (clusterID, diversity_nuc))
        except Exception:
            # best effort per cluster; KeyboardInterrupt/SystemExit still propagate
            print("Aligning and tree building of %s failed"%gene_cluster_nu_filename)
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path,
                                         simple_tree):
    """Align every RNA cluster, build its tree and log its nucleotide diversity.

    For each FASTA file in ``fa_files_list`` one line
    ``clusterID<TAB>diversity`` is appended to
    ``alignFile_path + 'gene_diversity.txt'``:
      * a cluster holding a single sequence is only re-written as
        ``*_na.aln`` and logged with diversity '0.0';
      * any other cluster is aligned and gets a tree through ``mpm_tree``;
        when ``simple_tree`` is False the tree additionally goes through
        ``ancestral`` and ``refine``.

    Errors are not caught here: a failing cluster aborts the loop and the
    exception reaches the caller.

    Parameters:
        fa_files_list: iterable of paths to RNA cluster FASTA files.
        alignFile_path: string prefix (joined by concatenation) for the
            exported files and gene_diversity.txt.
        simple_tree: when it compares equal to False, the tree is built with
            treetime_used=True and the ancestral/refine steps are run.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    diversity_filename = alignFile_path + 'gene_diversity.txt'
    for RNA_cluster_nu_filename in fa_files_list:
        # extract GC_RNA002 from path/GC_RNA002.aln
        clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
        # context manager: the handle is closed even when a step below raises
        with open(diversity_filename, 'a') as geneDiversity_file:
            cluster_seqs = read_fasta(RNA_cluster_nu_filename)
            if len(cluster_seqs) == 1:  # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace(
                    '.fna', '_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in cluster_seqs.iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'),
                                    Sequence)
                geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
            else:  # align and build tree
                print(RNA_cluster_nu_filename)
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()

                if simple_tree == False:
                    myTree.build(raxml=False,
                                 fasttree_program=fasttree_name,
                                 treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False,
                                 fasttree_program=fasttree_name,
                                 treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)

                RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
                geneDiversity_file.write('%s\t%s\n' %
                                         (clusterID, RNA_diversity_values))
                print('%s %s' % (clusterID, RNA_diversity_values))
Beispiel #7
0
# Parse the command line and normalise the main folder to an absolute path
# with a trailing slash (it is used as a plain string prefix below).
params = parser.parse_args()
path = os.path.abspath(params.folder_name)+'/'
if params.steps[0]=='all':
    ## run all steps
    params.steps = list(range(1, 12))

print('Running panX in main folder: %s'%path)

## external programs: alias tried first -> original program name as fallback
programs={'mcl':'mcl', 'mafft':'mafft', 'fasttree':'FastTree', 'raxml':'raxmlHPC'}
if params.diamond_path=='':
    ## diamond only has to be on the search path when no explicit path is given
    programs['diamond']='diamond'
for program_alias, program_name in programs.items():
    ## test passed if either the alias or the original program name exists
    if check_dependency(program_alias) or check_dependency(program_name):
        continue
    ## the program is not installed: explain and stop
    if program_name=='diamond':
        warning=('\ndiamond not found:\nplease make sure that diamond is installed '+
                 'and diamond binary file is included in the executable search path (e.g.: /usr/bin/diamond);\n'+
                 'alternatively, one can specify diamond path via the parameter -dmp (e.g.: ./panX.py -dmp /mypath/diamond -fn ...)')
        print(warning)
    else:
        print('program '+program_name+' not found, please install it.')
    ## non-zero status so that calling scripts can detect the failure
    raise SystemExit(1)
def cutTree_outputCluster(file_list, file_path, cut_branch_threshold,
                          treefile_used):
    """
    process flow for parallelization to cut the tree and output the clades in new clusters

    For every entry of file_list a tree is obtained (read from a newick file
    when treefile_used is True, otherwise built via quick_align_makeTree),
    cut with cut_tree_gather_clades, and the resulting clades are written
    with output_cutted_clusters. Clusters that were split and whose file
    name does not contain '_r' are appended to
    file_path + 'old_clusters_longSplit.txt'.

    A tree file that cannot be read is reported and skipped.

    Parameters:
        file_list: iterable of input paths ('.nwk' tree files when
            treefile_used is True, cluster sequence files otherwise).
        file_path: string prefix for the output files.
        cut_branch_threshold: passed through to cut_tree_gather_clades and
            output_cutted_clusters.
        treefile_used: True to read existing trees instead of building them.
    """
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for input_filepath in file_list:
        if treefile_used == True:
            ## read tree
            input_cluster_filename = input_filepath.split('/')[-1].replace(
                '.nwk', '.fna')
            try:
                tree = Phylo.read(input_filepath, 'newick')
            except Exception:
                print('%s %s' % ('reading tree failed: ', input_filepath))
                # no tree for this cluster: going on would either raise a
                # NameError or silently re-cut the previous iteration's tree
                continue
        else:
            ## make tree
            input_cluster_filename = input_filepath.split('/')[-1]
            tree = quick_align_makeTree(input_filepath, fasttree_name)

        ## attempt to cut the tree
        gene_list, rest_genes = cut_tree_gather_clades(tree,
                                                       cut_branch_threshold)

        if len(gene_list) != 0:
            ## add to-be-deleted cluster records
            if '_r' not in input_cluster_filename:
                ## 1st check: original cluster has been split
                ## 2nd check: it's not a "further-split" cluster
                ##            from an already split cluster
                with open(file_path + 'old_clusters_longSplit.txt',
                          'a') as delete_cluster_file:
                    delete_cluster_file.write('%s\n' % input_cluster_filename)
            ## further process on left-over genes
            if len(rest_genes) != 0:
                output_cutted_clusters(file_path,
                                       input_cluster_filename,
                                       rest_genes,
                                       cut_branch_threshold,
                                       treefile_used=False,
                                       cut_leftover=True)
        elif '_r' in input_cluster_filename:
            ## nothing can be cut any further: these are rest genes which
            ## cannot be split, so they are written out as they are
            gene_list = rest_genes
            rest_genes = []
        ## (an unsplit original cluster falls through with an empty gene_list)

        ## write clades in gene_list into clusters
        output_cutted_clusters(file_path,
                               input_cluster_filename,
                               gene_list,
                               cut_branch_threshold,
                               treefile_used=False,
                               cut_leftover=False)