def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file = ''.join([clustering_path, input_prefix, '_cluster.output'])
    representative_outputfile = ''.join(
        [clustering_path, input_prefix, '_representative', '.faa'])
    subproblem_seqs_path = '%ssubproblem_cluster_seqs/' % clustering_path
    subproblem_merged_faa = ''.join([clustering_path, input_prefix, '.faa'])
    subproblem_faa_dict = read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt = defaultdict(list)
        cluster_input_lines = [iline for iline in cluster_input]
        subproblem_geneCluster_dt = {}
        subproblem_run_number = input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):  #cluster_input
            ## use time to avoid clusterID conflict
            clusterID = "GCs%s_%07d%s" % (subproblem_run_number, gid,
                                          time.strftime('%M%S', time.gmtime()))
            gene_ids = iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID] = gene_ids
            ## representative_seq
            representative_seq = subproblem_faa_dict[gene_ids[0]]
            ## write the representative sequence to the output file
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID,
                            representative_seq)
        ## write subproblem_geneCluster_dt
        write_pickle(''.join([clustering_path, input_prefix, '_dicts.cpk']),
                     subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix, ': ', times(
        start), '\n'
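
Every example on this page calls a small set of helpers imported from sf_miscellaneous (read_fasta, write_in_fa, load_pickle, write_pickle). The minimal sketch below shows the interface the two FASTA helpers appear to assume; it is illustrative only, and the real implementations may differ (e.g. use Biopython or keep only part of the record description).

from collections import OrderedDict

def read_fasta(filename):
    """Minimal sketch: return an {seq_id: sequence} dict for a FASTA file."""
    seq_dict = OrderedDict()
    seq_id = None
    with open(filename) as fasta_file:
        for line in fasta_file:
            line = line.rstrip()
            if line.startswith('>'):
                seq_id = line[1:]
                seq_dict[seq_id] = ''
            elif seq_id is not None:
                seq_dict[seq_id] += line
    return seq_dict

def write_in_fa(file_handle, seq_id, sequence):
    """Minimal sketch: append one FASTA record to an already opened file handle."""
    file_handle.write('>%s\n%s\n' % (seq_id, sequence))
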
Code example #2
def create_RNACluster_fa(path, folders_dict):
    """
        input: '.fna', '_RNA_nuc_dict.cpk', 'allclusters.cpk'
        output: '.aln', 'tree.json', etc
    """
    RNA_path = folders_dict['RNA_path']
    RNA_dict = load_pickle('%s%s' % (RNA_path, 'all_RNA_seq.cpk'))

    ## load RNA cluster cpk file
    diamond_RNACluster_dt = load_pickle(RNA_path + 'allclusters.cpk')

    ## load RNAID_to_RNASeqID cpk file
    RNAID_to_RNASeqID_dict = load_pickle(path + 'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path = path + 'geneCluster/'
    ## diamond_RNACluster_dt: {clusterID: [count_strains, [memb1,...], count_RNAs]}
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        RNA_cluster_nu_write = open(fasta_path + RNA_cluster_nu_filename, 'wb')
        ## write nucleotide/amino_acid sequences into RNACluster files
        for RNA_memb in RNA[1]:
            ## RNA_name format: strain_1|locusTag
            strain_name = RNA_memb.split('|')[0]
            RNA_memb_seq = str(RNA_dict[strain_name][RNA_memb])
            RNASeqID = RNAID_to_RNASeqID_dict[RNA_memb]
            write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq)
        RNA_cluster_nu_write.close()
    return diamond_RNACluster_dt
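
A hedged usage sketch for create_RNACluster_fa; the paths and folder layout below are illustrative, not taken from the source. folders_dict is assumed to map logical folder names to directory paths ending in '/'.

path = './run1/'                                   # hypothetical project root
folders_dict = {'RNA_path': path + 'RNA_fna/'}     # must hold all_RNA_seq.cpk and allclusters.cpk
## one <clusterID>.fna per RNA cluster is written into path + 'geneCluster/'
diamond_RNACluster_dt = create_RNACluster_fa(path, folders_dict)
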
Code example #3
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path, simple_tree):
    fasttree_name= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for RNA_cluster_nu_filename in fa_files_list:
        try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len( read_fasta(RNA_cluster_nu_filename) )==1: # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename= RNA_cluster_nu_filename.replace('.fna','_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(RNA_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                geneDiversity_file.write('%s\t%s\n'%(clusterID,'0.0'))
            else: # align and build tree
                print RNA_cluster_nu_filename
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()

                if simple_tree==False:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine(CDS=False)
                else:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)

                RNA_diversity_values='{0:.3f}'.format(myTree.diversity_nuc)
                geneDiversity_file.write('%s\t%s\n'%(clusterID,RNA_diversity_values))
                print clusterID,RNA_diversity_values
        except:
            print("Aligning and tree building of RNA %s failed"%RNA_cluster_nu_filename)
Code example #4
def create_geneCluster_fa(path,folders_dict):
    """ dict storing amino_acid Id/Seq from '.faa' files
        input: '.faa', '_gene_nuc_dict.cpk', 'allclusters.cpk'
        output:
    """
    ## make sure the geneCluster folder is empty
    os.system('rm -rf %s'%(path+'geneCluster/'))

    clustering_path= folders_dict['clustering_path']
    geneCluster_dt= load_pickle(clustering_path+'allclusters.cpk')
    protein_path= folders_dict['protein_path']
    nucleotide_path= folders_dict['nucleotide_path']

    geneID_to_geneSeqID_dict=load_pickle(path+'geneID_to_geneSeqID.cpk')
    gene_aa_dict= load_pickle('%s%s'%(protein_path,'all_protein_seq.cpk'))
    gene_na_dict= load_pickle('%s%s'%(nucleotide_path,'all_nucleotide_seq.cpk'))

    ## create cluster-genes fasta files
    cluster_seqs_path=path+'geneCluster/'
    os.system('mkdir '+cluster_seqs_path)

    ## write nuc/aa sequences for each cluster
    for clusterID, gene in geneCluster_dt.iteritems():
        ## geneCluster file name
        gene_cluster_nu_filename="%s%s"%(clusterID,'.fna')
        gene_cluster_aa_filename="%s%s"%(clusterID,'.faa')
        with open( cluster_seqs_path+gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write, \
            open( cluster_seqs_path+gene_cluster_aa_filename, 'wb') as gene_cluster_aa_write:
            ## write nucleotide/amino_acid sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name= gene_memb.split('|')[0]
                geneSeqID=geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_na_dict[strain_name][gene_memb] )
                write_in_fa(gene_cluster_aa_write, geneSeqID, gene_aa_dict[strain_name][gene_memb])
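
The allclusters.cpk dictionary loaded here (and in several examples below) maps each clusterID to [count_strains, member_list, count_genes], as the inline comments state. An illustrative entry with made-up IDs:

geneCluster_dt = {
    'GC_00001': [3,                                  # number of distinct strains in the cluster
                 ['strain_1|locusTag_A',             # members in strain|locusTag format
                  'strain_2|locusTag_B',
                  'strain_3|locusTag_C'],
                 3],                                 # total number of genes in the cluster
}
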
Code example #6
def create_RNACluster_fa(path,folders_dict):
    """
        input: '.fna', '_RNA_nuc_dict.cpk', 'allclusters.cpk'
        output: '.aln', 'tree.json', etc
    """
    RNA_path= folders_dict['RNA_path']
    RNA_dict= load_pickle('%s%s'%(RNA_path,'all_RNA_seq.cpk'))

    ## load RNA cluster cpk file
    diamond_RNACluster_dt=load_pickle(RNA_path+'allclusters.cpk')

    ## load RNAID_to_RNASeqID cpk file
    RNAID_to_RNASeqID_dict=load_pickle(path+'RNAID_to_SeqID.cpk')

    ## create cluster-RNAs fasta files (By default: put RNAs in geneCluster folder)
    fasta_path=path+'geneCluster/';
    ## diamond_RNACluster_dt: {clusterID: [count_strains, [memb1,...], count_RNAs]}
    for clusterID, RNA in diamond_RNACluster_dt.iteritems():
        ## RNACluster file name
        RNA_cluster_nu_filename="%s%s"%(clusterID,'.fna')
        RNA_cluster_nu_write=open( fasta_path+RNA_cluster_nu_filename, 'wb')
        ## write nucleotide/amino_acid sequences into RNACluster files
        for RNA_memb in RNA[1]:
            ## RNA_name format: strain_1|locusTag
            strain_name= RNA_memb.split('|')[0]
            RNA_memb_seq=str(RNA_dict[strain_name][RNA_memb])
            RNASeqID=RNAID_to_RNASeqID_dict[RNA_memb]
            write_in_fa(RNA_cluster_nu_write, RNASeqID, RNA_memb_seq )
        RNA_cluster_nu_write.close()
    return diamond_RNACluster_dt
Code example #7
def build_representative_cluster(clustering_path, threads, input_prefix):
    """ build representative cluster """
    start = time.time()
    cluster_file= ''.join([clustering_path,input_prefix,'_cluster.output'])
    representative_outputfile= ''.join([clustering_path,input_prefix,'_representative','.faa'])
    subproblem_seqs_path= '%ssubproblem_cluster_seqs/'%clustering_path
    subproblem_merged_faa= ''.join([clustering_path,input_prefix,'.faa'])
    subproblem_faa_dict= read_fasta(subproblem_merged_faa)
    with open(cluster_file, 'rb') as cluster_input:
        subproblem_geneCluster_dt= defaultdict(list)
        cluster_input_lines= [iline for iline in cluster_input]
        subproblem_geneCluster_dt= {}
        subproblem_run_number= input_prefix.split('subproblem_')[1]
        for gid, iline in enumerate(cluster_input_lines):#cluster_input
            ## use time to avoid clusterID conflict
            clusterID= "GCs%s_%07d%s"%(subproblem_run_number, gid, time.strftime('%M%S',time.gmtime()))
            gene_ids= iline.rstrip().split('\t')
            subproblem_geneCluster_dt[clusterID]= gene_ids
            ## representative_seq
            representative_seq=subproblem_faa_dict[gene_ids[0]]
            ## write the representative sequence to the output file
            with open(representative_outputfile, 'a') as representative_output:
                write_in_fa(representative_output, clusterID, representative_seq)
        ## write subproblem_geneCluster_dt
        write_pickle(''.join([clustering_path,input_prefix,'_dicts.cpk']), subproblem_geneCluster_dt)
    print 'build representative clusters for', input_prefix,': ', times(start), '\n'
Code example #8
def align_and_makeTree( fna_file_list, alignFile_path, simple_tree):
    fasttree_name= 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for gene_cluster_nu_filename in fna_file_list:
        try:
            # extract GC_00002 from path/GC_00002.aln
            clusterID = gene_cluster_nu_filename.split('/')[-1].split('.')[0]
            start = time.time();
            geneDiversity_file = open(alignFile_path+'gene_diversity.txt', 'a')
            if len( read_fasta(gene_cluster_nu_filename) )==1: # nothing to do for singletons
                ## na_aln.fa
                gene_cluster_nu_aln_filename= gene_cluster_nu_filename.replace('.fna','_na_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_nu_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                os.system( ' '.join(['cp',gene_cluster_nu_aln_filename,gene_cluster_nu_aln_filename.replace('_aln','_aln_reduced')]) )

                ## aa_aln.fa
                gene_cluster_aa_filename= gene_cluster_nu_filename.replace('.fna','.faa')
                gene_cluster_aa_aln_filename= gene_cluster_nu_filename.replace('.fna','_aa_aln.fa')
                ## geneSeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(gene_cluster_aa_aln_filename,'wb') as write_file:
                    for SeqID, Sequence in read_fasta(gene_cluster_aa_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|','-'), Sequence)
                os.system( ' '.join(['cp',gene_cluster_aa_aln_filename,gene_cluster_aa_aln_filename.replace('_aln','_aln_reduced')]) )

                geneDiversity_file.write('%s\t%s\n'%(clusterID,'0.0'))
            else: # align and build tree
                #print gene_cluster_nu_filename
                myTree = mpm_tree(gene_cluster_nu_filename)
                myTree.codon_align()
                myTree.translate()
                if simple_tree==False:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False,fasttree_program=fasttree_name,treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path)
                #myTree.diversity_statistics_aa()
                #random_alnID=myTree.seqs.keys()[0].split('-')[0]
                diversity_nuc= round(myTree.diversity_nuc,3)#diversity_aa=round(myTree.diversity_aa,3)
                #bestSplit_paraNodes,bestSplit_branchLen = myTree.paralogy_statistics()
                #mean_seqLen, std_seqLen=  myTree.mean_std_seqLen()
                #mean_seqLen, std_seqLen= [ round(i,3) for i in mean_seqLen, std_seqLen ]
                geneDiversity_file.write('%s\t%s\n'%(clusterID,diversity_nuc))
                if 0:
                    cluster_correl_stats_file = open(alignFile_path+'cluster_correl_stats.txt', 'a')
                    cluster_correl_stats_file.write('%s\n'%'\t'.join([
                     str(i) for i in [clusterID, random_alnID, diversity_nuc, \
                        mean_seqLen, std_seqLen, bestSplit_paraNodes, bestSplit_branchLen ] ]))
        except:
            print("Aligning and tree building of %s failed"%gene_cluster_nu_filename)
Code example #10
def make_genepresence_alignment(path, disable_gain_loss,
                                merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence
    '''
    geneClusterPath = '%s%s' % (path, 'protein_fna/diamond_matches/')
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle('%s%s' % (path, 'strain_list.cpk'))
    set_totalStrain = set([istrain for istrain in strain_list])
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain,
                            gene[1])

    with open('%s%s' % (output_path, 'genePresence.aln'),
              'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s' % (output_path, 'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        geneEvents_dt = {i: 0 for i in range(len(sorted_genelist))}
        write_pickle('%s%s' % (output_path, 'dt_geneEvents.cpk'),
                     geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname = '%s%s' % (output_path, 'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist = dt_strainGene.keys()
            keylist.sort()
            strainID_keymap = {ind: k
                               for ind, k in enumerate(keylist)
                               }  # dict(zip(keylist, range(3)))
            presence_arr = np.array([
                np.array(dt_strainGene[k], 'c') for k in keylist
            ])  # 0: present, 3: absent
            presence_arr[presence_arr == '1'] = '3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt = {
                    strainID_keymap[strain_ind]: str(patt)
                    for strain_ind, patt in enumerate(presence_arr[:, ind])
                }
                pattern_fname = '%s%s_patterns.json' % (output_path, clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
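
make_genepresence_alignment delegates the per-cluster bookkeeping to create_genePresence, which is not shown on this page. Below is a plausible minimal re-implementation inferred purely from the call site; the real helper may differ.

def create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene_members):
    """Sketch: append '1' for each strain that carries the gene, '0' otherwise.

    totalStrain is kept only for signature compatibility with the call above.
    """
    strains_with_gene = set(member.split('|')[0] for member in gene_members)
    for strain in set_totalStrain:
        dt_strainGene[strain] += '1' if strain in strains_with_gene else '0'
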
Code example #11
def export_cluster_seq_tmp(cluster_seqs_path, geneCluster_dt,
    geneID_to_geneSeqID_dict, gene_na_dict, gene_aa_dict):
    """ write nuc/aa sequences for each cluster  """
    for clusterID, gene in geneCluster_dt.iteritems():
        ## geneCluster file name
        gene_cluster_nu_filename="%s%s"%(clusterID,'.fna')
        with open( cluster_seqs_path+gene_cluster_nu_filename, 'wb') as gene_cluster_nu_write:
            ## write nucleotide sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name= gene_memb.split('|')[0]
                geneSeqID=geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID, gene_na_dict[strain_name][gene_memb] )
Code example #12
def concatenate_core_gene_alignments(input_path, output_path):
    core_genes_dt=defaultdict(str)
    with open(input_path+'/geneCluster/core_geneList.txt') as core_list:
    # all core gene alignments in FASTA files
        for gene in core_list:
            gene_path= input_path+'/vis/geneCluster/'+gene.rstrip()+'.gz'
            with gzip.open(gene_path, 'rb') as zip_file:
                for record in SeqIO.parse(zip_file, "fasta"):
                    #NC_018495-CM9_RS00390-1-hypothetical_protein
                    accession=record.id.split('-')[0]
                    core_genes_dt[accession]= '%s%s'%(core_genes_dt[accession], record.seq)

    with open(output_path,'wb') as output_file:
        for gene_id, gene_seq in core_genes_dt.iteritems():
            write_in_fa(output_file, gene_id, gene_seq)
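
A hedged usage sketch for concatenate_core_gene_alignments; it assumes core_geneList.txt and the gzip-compressed per-gene alignments already exist under the (illustrative) input path, and that gzip, Biopython's SeqIO, defaultdict and write_in_fa are imported as in the function body.

## concatenates every core-gene alignment listed in <input>/geneCluster/core_geneList.txt,
## keyed by strain accession, into one pseudo-alignment FASTA
concatenate_core_gene_alignments('./run1', './run1/geneCluster/core_concatenation.aln')
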
Code example #13
def export_cluster_seq_tmp(cluster_seqs_path, geneCluster_dt,
                           geneID_to_geneSeqID_dict, gene_na_dict,
                           gene_aa_dict):
    """ write nuc/aa sequences for each cluster  """
    for clusterID, gene in geneCluster_dt.iteritems():
        ## geneCluster file name
        gene_cluster_nu_filename = "%s%s" % (clusterID, '.fna')
        with open(cluster_seqs_path + gene_cluster_nu_filename,
                  'wb') as gene_cluster_nu_write:
            ## write nucleotide sequences into geneCluster files
            for gene_memb in gene[1]:
                ## gene_name format: strain_1|locusTag
                strain_name = gene_memb.split('|')[0]
                geneSeqID = geneID_to_geneSeqID_dict[gene_memb]
                write_in_fa(gene_cluster_nu_write, geneSeqID,
                            gene_na_dict[strain_name][gene_memb])
Code example #14
def concatenate_core_gene_alignments(input_path, output_path):
    core_genes_dt = defaultdict(str)
    with open(input_path + '/geneCluster/core_geneList.txt') as core_list:
        # all core gene alignments in FASTA files
        for gene in core_list:
            gene_path = input_path + '/vis/geneCluster/' + gene.rstrip(
            ) + '.gz'
            with gzip.open(gene_path, 'rb') as zip_file:
                for record in SeqIO.parse(zip_file, "fasta"):
                    #NC_018495-CM9_RS00390-1-hypothetical_protein
                    accession = record.id.split('-')[0]
                    core_genes_dt[accession] = '%s%s' % (
                        core_genes_dt[accession], record.seq)

    with open(output_path, 'wb') as output_file:
        for gene_id, gene_seq in core_genes_dt.iteritems():
            write_in_fa(output_file, gene_id, gene_seq)
Code example #15
def single_RNACluster_align_and_makeTree(fa_files_list, alignFile_path,
                                         simple_tree):
    fasttree_name = 'fasttree' if check_dependency('fasttree') else 'FastTree'
    for RNA_cluster_nu_filename in fa_files_list:
        if 1:  #try:
            # extract GC_RNA002 from path/GC_RNA002.aln
            clusterID = RNA_cluster_nu_filename.split('/')[-1].split('.')[0]
            geneDiversity_file = open(alignFile_path + 'gene_diversity.txt',
                                      'a')
            if len(read_fasta(RNA_cluster_nu_filename)
                   ) == 1:  # nothing to do for singletons
                ## na.aln
                RNA_cluster_nu_aln_filename = RNA_cluster_nu_filename.replace(
                    '.fna', '_na.aln')
                ## RNA SeqID separator '|' is replaced by '-' for msa viewer compatibility
                with open(RNA_cluster_nu_aln_filename, 'wb') as write_file:
                    for SeqID, Sequence in read_fasta(
                            RNA_cluster_nu_filename).iteritems():
                        write_in_fa(write_file, SeqID.replace('|', '-'),
                                    Sequence)
                geneDiversity_file.write('%s\t%s\n' % (clusterID, '0.0'))
            else:  # align and build tree
                print RNA_cluster_nu_filename
                myTree = mpm_tree(RNA_cluster_nu_filename)
                myTree.align()

                if simple_tree == False:
                    myTree.build(raxml=False,
                                 fasttree_program=fasttree_name,
                                 treetime_used=True)
                    myTree.ancestral(translate_tree=True)
                    myTree.refine()
                else:
                    myTree.build(raxml=False,
                                 fasttree_program=fasttree_name,
                                 treetime_used=False)
                myTree.diversity_statistics_nuc()
                myTree.export(path=alignFile_path, RNA_specific=True)

                RNA_diversity_values = '{0:.3f}'.format(myTree.diversity_nuc)
                geneDiversity_file.write('%s\t%s\n' %
                                         (clusterID, RNA_diversity_values))
                print clusterID, RNA_diversity_values
        if 0:  #except:
            print("Aligning and tree building of RNA %s failed" %
                  RNA_cluster_nu_filename)
Code example #16
def make_genepresence_alignment(path, disable_gain_loss, merged_gain_loss_output):
    '''
    loop over all gene clusters and append 0/1 to strain specific
    string used as pseudo alignment of gene presence absence
    '''
    geneClusterPath='%s%s'%(path,'protein_fna/diamond_matches/')
    output_path='%s%s'%(path,'geneCluster/');

    ## load strain list and prepare for gene presence/absence
    strain_list= load_pickle('%s%s'%(path,'strain_list.cpk'))
    set_totalStrain=set([ istrain for istrain in strain_list ])
    totalStrain=len(set_totalStrain)
    dt_strainGene= defaultdict(str)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for clusterID, gene in sorted_genelist:
        ## append 0/1 to each strain
        create_genePresence(dt_strainGene, totalStrain, set_totalStrain, gene[1])

    with open('%s%s'%(output_path,'genePresence.aln'),'wb') as presence_outfile:
        for istkey in dt_strainGene:
            write_in_fa( presence_outfile, istkey, dt_strainGene[istkey])
    write_pickle('%s%s'%(output_path,'dt_genePresence.cpk'), dt_strainGene)

    if disable_gain_loss:
        geneEvents_dt={ i:0 for i in range(len(sorted_genelist)) }
        write_pickle('%s%s'%(output_path,'dt_geneEvents.cpk'), geneEvents_dt)
        if merged_gain_loss_output:
            gene_loss_fname='%s%s'%(output_path,'geneGainLossEvent.json')
            write_json(dt_strainGene, gene_loss_fname, indent=1)
        else:
            ## strainID as key, presence pattern as value (converted into np.array)
            keylist= dt_strainGene.keys(); keylist.sort()
            strainID_keymap= {ind:k for ind, k in enumerate(keylist)} # dict(zip(keylist, range(3)))
            presence_arr= np.array([ np.array(dt_strainGene[k],'c') for k in keylist]) # 0: present, 3: absent
            presence_arr[presence_arr=='1']='3'
            for ind, (clusterID, gene) in enumerate(sorted_genelist):
                pattern_dt= { strainID_keymap[strain_ind]:str(patt) for strain_ind, patt in enumerate(presence_arr[:, ind])}
                pattern_fname='%s%s_patterns.json'%(output_path,clusterID)
                write_json(pattern_dt, pattern_fname, indent=1)
Code example #17
def create_core_SNP_matrix(path, core_cutoff=1.0, core_gene_strain_fpath=''):#1.0
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    """
    import os,sys,operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath='%s%s'%(path,'geneCluster/')
    output_path= alnFilePath

    ## create core gene list
    corelist=[]
    strain_list=load_pickle(path+'strain_list.cpk')
    totalStrain= len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath!='':
        with open(core_gene_strain_fpath,'rb') as core_gene_strain_file:
            core_strain_set= set([i.rstrip().replace('-','_') for i in core_gene_strain_file])
    with open(output_path+'core_geneList.txt','wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff==1.0:
                strain_core_cutoff=totalStrain
            else:
                strain_core_cutoff=int(totalStrain*core_cutoff)
            if vg[0]==vg[2] and vg[0]>=strain_core_cutoff:
                coreGeneName='%s%s'%(clusterID,'_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path= alnFilePath+coreGeneName
                if os.path.exists(coreGeneName_path) and len(read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath!='' and len(core_strain_set-set([i.split('|')[0] for i in vg[1]]))!=0:
                        continue
                    outfile.write(coreGeneName+'\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass

        write_pickle(output_path+'core_geneList.cpk',corelist)

    refSeqList=load_pickle(path+'strain_list.cpk');refSeqList.sort()

    snp_fre_lst=[]; snp_wh_matrix_flag=0
    snp_pos_dt=defaultdict(list); snp_whole_matrix=np.array([])
    snps_by_gene=[]
    for align_file in corelist:## core genes
        nuc_array=np.array([]) # array to store nucleotides for each gene
        gene_seq_dt=read_fasta(alnFilePath+align_file)
        if core_cutoff!=1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq=' '*len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst=sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt=defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]]=seq # strain-locus_tag-...
        strain_seq_sorted_lst=sorted(strain_seq_dt.items(), key=lambda x: x[0])

        start_flag=0
        if core_cutoff==1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag==0:
                    nuc_array=np.array(np.fromstring(va, dtype='S1'))
                    start_flag=1
                else:
                    nuc_array=np.vstack((nuc_array,np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis = 0)
            position_has_gap = np.any(nuc_array=='-', axis=0)
            position_SNP = position_polymorphic&(~position_has_gap)
            snp_columns = nuc_array[:,position_SNP]
            snp_pos_dt[align_file]=np.where(position_SNP)[0]
        else:
        ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain=[ gene for gene in strain_seq_dt.keys()]
            for strain in totalStrain_sorted_lst:
                if start_flag==0:
                    if strain in core_gene_strain:
                        nuc_array=np.array(np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s'%(strain,align_file)
                        nuc_array=np.array(np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag=1
                else:
                    if strain in core_gene_strain:
                        nuc_array=np.vstack((nuc_array,np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s'%(strain,align_file)
                        nuc_array=np.vstack((nuc_array,np.fromstring(missing_gene_seq, dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array==' ',axis=1)
            masked_non_missing_array= np.ma.masked_array(nuc_array, nuc_array==' ')
            position_polymorphic = np.any(masked_non_missing_array!= masked_non_missing_array[0, :],axis = 0)
            position_has_gap = np.any(masked_non_missing_array=='-',axis=0)
            position_SNP = position_polymorphic&(~position_has_gap)
            # the below seems duplicated from 5 lines above??
            if is_missing.sum()>0: # with missing genes
                nuc_array[is_missing]='-'
            snp_columns = nuc_array[:,position_SNP]
            snp_pos_dt[align_file]=np.where(position_SNP)[0]
            #print snp_columns

        if snp_wh_matrix_flag==0:
            snp_whole_matrix=snp_columns;
            snp_wh_matrix_flag=1
        else:
            snp_whole_matrix=np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path+'snp_pos.cpk',snp_pos_dt)

    with open(output_path+'SNP_whole_matrix.aln','wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa( outfile, refSeqList[ind], isw.tostring() )
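
The SNP extraction above keeps only alignment columns that are polymorphic and contain no gap in any strain. A small self-contained numpy illustration of that masking step (toy data, unrelated to the pipeline's files):

import numpy as np

## toy alignment: 3 strains x 6 columns
aln = np.array([list('ATGCA-'),
                list('ATGTA-'),
                list('ATGCA-')])
position_polymorphic = np.any(aln != aln[0, :], axis=0)   # columns differing from the first row
position_has_gap = np.any(aln == '-', axis=0)             # columns with a gap in any strain
position_SNP = position_polymorphic & (~position_has_gap)
print(np.where(position_SNP)[0])                          # [3]: only the C/T column is kept
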
Code example #18
def create_core_SNP_matrix(path,
                           core_cutoff=1.0,
                           core_gene_strain_fpath=''):  #1.0
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
        core_cutoff: percentage of strains used to decide whether a gene is core
            default: 1.0 (strictly core gene, which is present in all strains)
            customized: 0.9 ( soft core, considered as core if present in 90% of strains)
    """
    import os, sys, operator
    import numpy as np
    import numpy.ma as ma
    from collections import defaultdict
    from sf_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    strain_list = load_pickle(path + 'strain_list.cpk')
    totalStrain = len(strain_list)
    sorted_geneList = load_sorted_clusters(path)
    if core_gene_strain_fpath != '':
        with open(core_gene_strain_fpath, 'rb') as core_gene_strain_file:
            core_strain_set = set(
                [i.rstrip().replace('-', '_') for i in core_gene_strain_file])
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if core_cutoff == 1.0:
                strain_core_cutoff = totalStrain
            else:
                strain_core_cutoff = int(totalStrain * core_cutoff)
            if vg[0] == vg[2] and vg[0] >= strain_core_cutoff:
                coreGeneName = '%s%s' % (clusterID, '_na_aln.fa')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) >= strain_core_cutoff:
                    if core_gene_strain_fpath != '' and len(
                            core_strain_set -
                            set([i.split('|')[0] for i in vg[1]])) != 0:
                        continue
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    #print '%s%s%s'%('warning: ',coreGeneName_path,' is not a core gene')
                    pass

        write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])
    snps_by_gene = []
    for align_file in corelist:  ## core genes
        nuc_array = np.array([])  # array to store nucleotides for each gene
        gene_seq_dt = read_fasta(alnFilePath + align_file)
        if core_cutoff != 1.0:
            # set sequences for missing gene (space*gene_length)
            missing_gene_seq = ' ' * len(gene_seq_dt.values()[0])
            totalStrain_sorted_lst = sorted(strain_list)
        # build strain_seq_dt from gene_seq_dt
        strain_seq_dt = defaultdict()
        for gene, seq in gene_seq_dt.iteritems():
            strain_seq_dt[gene.split('-')[0]] = seq  # strain-locus_tag-...
        strain_seq_sorted_lst = sorted(strain_seq_dt.items(),
                                       key=lambda x: x[0])

        start_flag = 0
        if core_cutoff == 1.0:
            for ka, va in strain_seq_sorted_lst:
                if start_flag == 0:
                    nuc_array = np.array(np.fromstring(va, dtype='S1'))
                    start_flag = 1
                else:
                    nuc_array = np.vstack(
                        (nuc_array, np.fromstring(va, dtype='S1')))
            ## find SNP positions
            position_polymorphic = np.any(nuc_array != nuc_array[0, :], axis=0)
            position_has_gap = np.any(nuc_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
        else:
            ## add '-' for missing genes when dealing with soft core genes
            core_gene_strain = [gene for gene in strain_seq_dt.keys()]
            for strain in totalStrain_sorted_lst:
                if start_flag == 0:
                    if strain in core_gene_strain:
                        nuc_array = np.array(
                            np.fromstring(strain_seq_dt[strain], dtype='S1'))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.array(
                            np.fromstring(missing_gene_seq, dtype='S1'))
                    start_flag = 1
                else:
                    if strain in core_gene_strain:
                        nuc_array = np.vstack(
                            (nuc_array,
                             np.fromstring(strain_seq_dt[strain], dtype='S1')))
                    else:
                        print 'Soft core gene: gene absent in strain %s on cluster %s' % (
                            strain, align_file)
                        nuc_array = np.vstack((nuc_array,
                                               np.fromstring(missing_gene_seq,
                                                             dtype='S1')))
            ## find SNP positions
            ## mask missing genes -- determine rows that have ' ' in every column
            is_missing = np.all(nuc_array == ' ', axis=1)
            masked_non_missing_array = np.ma.masked_array(
                nuc_array, nuc_array == ' ')
            position_polymorphic = np.any(
                masked_non_missing_array != masked_non_missing_array[0, :],
                axis=0)
            position_has_gap = np.any(masked_non_missing_array == '-', axis=0)
            position_SNP = position_polymorphic & (~position_has_gap)
            # the below seems duplicated from 5 lines above??
            if is_missing.sum() > 0:  # with missing genes
                nuc_array[is_missing] = '-'
            snp_columns = nuc_array[:, position_SNP]
            snp_pos_dt[align_file] = np.where(position_SNP)[0]
            #print snp_columns

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))
    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
Code example #19
def create_split_cluster_files(file_path, fname, gene_list1, gene_list2,
                               geneCluster_dt):
    """
    delete the old cluster and create two new clusters
    params:
        new_fa_files: list to which new file names are appended
        gene_list1/2: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
    """
    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk', '')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk', 'fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk', 'faa')

    split_fa_files_set = set()
    ## load genes from old clusters
    origin_nu_fa_dt = read_fasta(file_path + origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_cluster_aa_fa)
    sgs_index = 0

    ## delete old (split) clusters
    try:
        #print('deleting:',orgin_nwk_name)
        ##debug:
        ##print('deleting:',orgin_nwk_name,gene_list1,gene_list2, clusterID)
        del geneCluster_dt[clusterID]
        with open(file_path + 'old_clusters_paralogSplit.txt',
                  'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n' % clusterID)
        if os.path.exists(fname):
            suffix_list = [
                '_aa_aln.fa', '_na_aln.fa', '.fna', '.faa', '.nwk',
                '_tree.json'
            ]
        else:
            suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
        tmp_files = ' '.join(
            [file_path + clusterID + suffix for suffix in suffix_list])
        command_move_deleted_clusters = ' '.join(
            ['mv', tmp_files, file_path + 'paralog_splits/'])
        os.system(command_move_deleted_clusters)
    except:
        print("paralog splitting: can't delete", orgin_nwk_name)
        ##debug:
        ##print("can't delete",orgin_nwk_name,gene_list1,gene_list2, clusterID)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for split_gene_list in (list(gene_list1), list(gene_list2)):
        sgs_index += 1
        newClusterId = "%s_p%s" % (clusterID, sgs_index)
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_nu_write = open(file_path + gene_cluster_nu_filename,
                                     'wb')
        gene_cluster_aa_write = open(file_path + gene_cluster_aa_filename,
                                     'wb')

        split_fa_files_set |= set([file_path + gene_cluster_nu_filename])

        ## write new split cluster files
        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:
                gene_memb = gene_memb.replace("\\'", "'")
            try:
                write_in_fa(gene_cluster_nu_write, gene_memb,
                            origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb,
                            origin_aa_fa_dt[gene_memb])
            except:
                print 'paralogy splitting (problem to write new split cluster files)', fname  #, gene_memb, gene_list1, gene_list2

        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()

        geneCluster_dt[newClusterId] = [0, [], 0]
        ## num_strains
        geneCluster_dt[newClusterId][0] = len(
            dict(Counter([ig.split('|')[0] for ig in split_gene_list])).keys())
        ## num_genes
        geneCluster_dt[newClusterId][2] = len(
            dict(Counter([ig for ig in split_gene_list])).keys())
        ## gene members
        geneCluster_dt[newClusterId][1] = [
            ig.split('-')[0] for ig in split_gene_list
        ]
    return split_fa_files_set
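
Both splitting routines on this page rebuild the [count_strains, members, count_genes] statistics from the member list via a Counter/dict/keys chain, which amounts to counting unique items. The same calculation in a more direct form, with made-up member IDs:

split_gene_list = ['strain_1|tagA-1-descA', 'strain_1|tagB-1-descB', 'strain_2|tagC-1-descC']
num_strains = len(set(member.split('|')[0] for member in split_gene_list))   # -> 2
num_genes = len(set(split_gene_list))                                        # -> 3
members = [member.split('-')[0] for member in split_gene_list]               # keep strain|locusTag only
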
Code example #20
def output_cutted_clusters(file_path, uncluster_filename, gene_list, cut_branch_threshold, treefile_used=None, cut_leftover=None):
    """
    delete the unclustered file and create new clusters
    params:
        gene_list: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
        cut_leftover: flag indicating whether these are the leftover nodes
            after cutting long branches. Default: None.
    """
    clusterID = uncluster_filename.replace('.fna','')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna','faa')

    new_fa_files=set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path+origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index,split_gene_list in enumerate(gene_list,1):
        if cut_leftover==True:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId="%s_r%s"%(clusterID,sgs_index)
        else:
            newClusterId="%s_%s"%(clusterID,sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filename="%s%s"%(newClusterId,'.fna')
        gene_cluster_nu_filepath= file_path+gene_cluster_nu_filename
        gene_cluster_nu_write=open(gene_cluster_nu_filepath , 'wb')

        gene_cluster_aa_filename="%s%s"%(newClusterId,'.faa')
        gene_cluster_aa_filepath= file_path+gene_cluster_aa_filename
        gene_cluster_aa_write=open( file_path+gene_cluster_aa_filename, 'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb: # Replace '\' in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb=gene_memb.replace("\\'","'")

            write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close(); gene_cluster_aa_write.close();
        #=============================================

        if cut_leftover==True:
            ## align the remaining genes, build a tree, and cut long branches until nothing more can be cut.
            cutTree_outputCluster([gene_cluster_nu_filepath],file_path, cut_branch_threshold, treefile_used)
        else:
            ## record the misclusters to be deleted (already addressed in cutTree_outputCluster )
            ## it will output the same cluster several times
            #with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
            #    delete_cluster_file.write('%s\n'%uncluster_filename)

            ## add a record to new_clusters_longSplit.txt, which is used to align the new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt=defaultdict(list)
            addin_geneCluster_dt[ newClusterId ] = [0,[],0]
            ## num_strains
            addin_geneCluster_dt[ newClusterId ][0]=len(dict(Counter([ ig.split('|')[0] for ig in split_gene_list])).keys())
            ## num_genes
            addin_geneCluster_dt[ newClusterId ][2]=len(dict(Counter([ ig for ig in split_gene_list])).keys())
            ## gene members
            addin_geneCluster_dt[ newClusterId ][1]=[ ig.split('-')[0] for ig in split_gene_list ]
            ## cPickle new cluster statistics
            write_pickle(''.join([file_path,'update_long_branch_splits/', newClusterId,'.cpk']),addin_geneCluster_dt)

    ## write records in gene_diversity file
    with open(file_path+'new_clusters_longSplit.txt', 'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n'%i)
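
A hedged call sketch for output_cutted_clusters; member IDs and paths are invented, and the cluster FASTA files plus cutTree_outputCluster must already exist for the call to succeed. gene_list is a list of member lists, one per new sub-cluster.

output_cutted_clusters('./run1/geneCluster/', 'GC_00001.fna',
                       gene_list=[['strain_1|tagA-1-x', 'strain_2|tagB-1-y'],
                                  ['strain_3|tagC-1-z']],
                       cut_branch_threshold=0.3,
                       treefile_used=True,
                       cut_leftover=False)
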
Code example #21
def output_cutted_clusters(file_path,
                           uncluster_filename,
                           gene_list,
                           cut_branch_threshold,
                           treefile_used=None,
                           cut_leftover=None):
    """
    delete the unclustered file and create new clusters
    params:
        gene_list: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
        cut_leftover: flag indicating whether these are the leftover nodes
            after cutting long branches. Default: None.
    """
    clusterID = uncluster_filename.replace('.fna', '')
    origin_uncluster_nu_fa = uncluster_filename
    origin_uncluster_aa_fa = uncluster_filename.replace('fna', 'faa')

    new_fa_files = set()

    ## load origin cluster fa files
    origin_nu_fa_dt = read_fasta(file_path + origin_uncluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path + origin_uncluster_aa_fa)

    ## split_gene_list has geneSeqID instead of geneID
    for sgs_index, split_gene_list in enumerate(gene_list, 1):
        if cut_leftover == True:
            ## newClusterId for the rest genes (_r as identifier)
            newClusterId = "%s_r%s" % (clusterID, sgs_index)
        else:
            newClusterId = "%s_%s" % (clusterID, sgs_index)

        #=============================================
        ## write new divided/split cluster files
        gene_cluster_nu_filename = "%s%s" % (newClusterId, '.fna')
        gene_cluster_nu_filepath = file_path + gene_cluster_nu_filename
        gene_cluster_nu_write = open(gene_cluster_nu_filepath, 'wb')

        gene_cluster_aa_filename = "%s%s" % (newClusterId, '.faa')
        gene_cluster_aa_filepath = file_path + gene_cluster_aa_filename
        gene_cluster_aa_write = open(file_path + gene_cluster_aa_filename,
                                     'wb')

        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:  # Replace '\' in node name:
                ## NC_018495|CM9_RS01675-1-guanosine-3',5'-... in fasta ID
                ## 'NC_018495|CM9_RS01675-1-guanosine-3\',5\'-...' in nwk node name
                ## Use origin_nu_fa_dt[gene_memb] will throw the KeyError:
                ## "NC_018495|CM9_RS01675-1-guanosine-3\\',5\\'"
                gene_memb = gene_memb.replace("\\'", "'")

            write_in_fa(gene_cluster_nu_write, gene_memb,
                        origin_nu_fa_dt[gene_memb])
            write_in_fa(gene_cluster_aa_write, gene_memb,
                        origin_aa_fa_dt[gene_memb])
        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()
        #=============================================

        if cut_leftover == True:
            ## align the remaining genes, build a tree, and cut long branches until nothing more can be cut.
            cutTree_outputCluster([gene_cluster_nu_filepath], file_path,
                                  cut_branch_threshold, treefile_used)
        else:
            ## record the misclusters to be deleted (already addressed in cutTree_outputCluster )
            ## it will output the same cluster several times
            #with open(file_path+'old_clusters_longSplit.txt', 'a') as delete_cluster_file:
            #    delete_cluster_file.write('%s\n'%uncluster_filename)

            ## add a record to new_clusters_longSplit.txt, which is used to align the new clusters
            new_fa_files.add(gene_cluster_nu_filepath)

            ## write cluster statistics in folder update_long_branch_splits
            addin_geneCluster_dt = defaultdict(list)
            addin_geneCluster_dt[newClusterId] = [0, [], 0]
            ## num_strains
            addin_geneCluster_dt[newClusterId][0] = len(
                dict(Counter([ig.split('|')[0]
                              for ig in split_gene_list])).keys())
            ## num_genes
            addin_geneCluster_dt[newClusterId][2] = len(
                dict(Counter([ig for ig in split_gene_list])).keys())
            ## gene members
            addin_geneCluster_dt[newClusterId][1] = [
                ig.split('-')[0] for ig in split_gene_list
            ]
            ## cPickle new cluster statistics
            write_pickle(
                ''.join([
                    file_path, 'update_long_branch_splits/', newClusterId,
                    '.cpk'
                ]), addin_geneCluster_dt)

    ## write records in gene_diversity file
    with open(file_path + 'new_clusters_longSplit.txt',
              'a') as refined_cluster_file:
        for i in new_fa_files:
            refined_cluster_file.write('%s\n' % i)
Code example #22
def create_split_cluster_files(file_path, fname,
    gene_list1, gene_list2, geneCluster_dt):
    """
    delete the old cluster and create two new clusters
    params:
        new_fa_files: list to which new file names are appended
        gene_list1/2: lists containing the genes in the new split clusters
        geneCluster_dt: cluster dictionary to be updated
    """
    orgin_nwk_name = fname.split('/')[-1]
    clusterID = orgin_nwk_name.replace('.nwk','')
    origin_cluster_nu_fa = orgin_nwk_name.replace('nwk','fna')
    origin_cluster_aa_fa = orgin_nwk_name.replace('nwk','faa')

    split_fa_files_set=set()
    ## load genes from old clusters
    origin_nu_fa_dt = read_fasta(file_path+origin_cluster_nu_fa)
    origin_aa_fa_dt = read_fasta(file_path+origin_cluster_aa_fa)
    sgs_index=0

    ## delete old (split) clusters
    try:
        #print('deleting:',orgin_nwk_name)
        ##debug:
        ##print('deleting:',orgin_nwk_name,gene_list1,gene_list2, clusterID)
        del geneCluster_dt[clusterID]
        with open(file_path+'old_clusters_paralogSplit.txt', 'a') as delete_cluster_file:
            delete_cluster_file.write('%s\n'%clusterID)
        if os.path.exists(fname):
            suffix_list=['_aa_aln.fa','_na_aln.fa','.fna','.faa','.nwk','_tree.json']
        else:
            suffix_list=['_aa_aln.fa','_na_aln.fa','.fna','.faa']
        tmp_files=' '.join([ file_path+clusterID+suffix for suffix in suffix_list ])
        command_move_deleted_clusters=' '.join(['mv', tmp_files, file_path+'paralog_splits/'])
        os.system(command_move_deleted_clusters)
    except:
        print("paralog splitting: can't delete",orgin_nwk_name)
        ##debug:
        ##print("can't delete",orgin_nwk_name,gene_list1,gene_list2, clusterID)

    ## write new cluster fa files
    ## split_gene_list has geneSeqID instead of geneID
    for split_gene_list in (list(gene_list1), list(gene_list2)):
        sgs_index+=1
        newClusterId="%s_p%s"%(clusterID,sgs_index)
        gene_cluster_nu_filename="%s%s"%(newClusterId,'.fna')
        gene_cluster_aa_filename="%s%s"%(newClusterId,'.faa')
        gene_cluster_nu_write=open( file_path+gene_cluster_nu_filename, 'wb')
        gene_cluster_aa_write=open( file_path+gene_cluster_aa_filename, 'wb')

        split_fa_files_set.add(file_path+gene_cluster_nu_filename)

        ## write new split cluster files
        for gene_memb in split_gene_list:
            if "\\'" in gene_memb:
                gene_memb=gene_memb.replace("\\'","'")
            try:
                write_in_fa(gene_cluster_nu_write, gene_memb, origin_nu_fa_dt[gene_memb])
                write_in_fa(gene_cluster_aa_write, gene_memb, origin_aa_fa_dt[gene_memb])
            except:
                print 'paralog splitting: problem writing new split cluster files for', fname #, gene_memb, gene_list1, gene_list2

        gene_cluster_nu_write.close()
        gene_cluster_aa_write.close()

        geneCluster_dt[ newClusterId ] = [0,[],0]
        ## num_strains: number of distinct strains among the split genes
        geneCluster_dt[ newClusterId ][0]=len(set( ig.split('|')[0] for ig in split_gene_list))
        ## num_genes: number of distinct genes in the split cluster
        geneCluster_dt[ newClusterId ][2]=len(set(split_gene_list))
        ## gene members
        geneCluster_dt[ newClusterId ][1]=[ ig.split('-')[0] for ig in split_gene_list ]
    return split_fa_files_set
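The old cluster files above are moved aside with a shell `mv` via os.system. As a design note, the same step could be done in pure Python with shutil; below is a minimal sketch under the same suffix conventions (move_deleted_cluster is a hypothetical helper, not part of the pipeline) that also tolerates missing files.

import os
import shutil

def move_deleted_cluster(file_path, clusterID, keep_tree_files=True):
    ## same suffix convention as in create_split_cluster_files
    suffix_list = ['_aa_aln.fa', '_na_aln.fa', '.fna', '.faa']
    if keep_tree_files:
        suffix_list += ['.nwk', '_tree.json']
    target_dir = os.path.join(file_path, 'paralog_splits')
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for suffix in suffix_list:
        src = os.path.join(file_path, clusterID + suffix)
        ## skip files that were never produced instead of failing the whole move
        if os.path.exists(src):
            shutil.move(src, target_dir)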
Code example #23
def gbk_translation(strainID, gbk_fname, protein_fname, nucleotide_fname, RNA_fname,
    geneID_to_geneSeqID_dict,geneID_to_description_dict,
    RNAID_to_SeqID_dict, RNAID_to_description_dict,
    gene_aa_dict, gene_na_dict, RNA_dict, enable_RNA_clustering):
    '''
    extract sequences and meta information for all genes in one reference GenBank file
    params:
        - gbk_fname:        Genbank filename
        - protein_fname:  file into which all amino acid sequences are written
                            in fasta format. needed as input for diamond
        - nucleotide_fname: file into which all nucleotide sequences are written
                            in fasta format. needed for cluster sequences
        - RNA_fname:        file into which RNA nucleotide sequences are written
                            in fasta format. Needed as RNA blast input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene sequence ID
                            modified in place (key: geneID; value: geneSeqID )
        - geneID_to_description_dict: dictionary linking geneID to description info
                            modified in place (key: geneID; value: a dict including
                            information on contig_index, annotation or more)
        - RNAID_to_SeqID_dict: dictionary linking RNAID to RNA sequence ID
                            modified in place (key: RNAID; value: SeqID )
        - RNAID_to_description_dict: dictionary linking RNAID to description info
                            modified in place (key: RNAID; value: a dict including
                            information on contig_index, annotation or more)
        - enable_RNA_clustering: if True, rRNA sequences are also extracted and written for clustering
    '''

    aa_sequence_file=open(protein_fname, 'wb')
    nu_sequence_file=open(nucleotide_fname, 'wb')

    if enable_RNA_clustering:
        RNA_sequence_file=open(RNA_fname, 'wb')

    contig_index=0
    ## flag set to 1 once at least one CDS feature has been seen (returned to the caller)
    check_CDS_passed=0
    for contig in SeqIO.parse(gbk_fname,'genbank'):
        contig_index+=1
        for feature in contig.features:
            if feature.type=='CDS':
                if not check_CDS_passed:
                    check_CDS_passed=1
                if 'product' in feature.qualifiers and 'translation' in feature.qualifiers :
                    if 'gene' in feature.qualifiers :
                        geneName='%s'%(feature.qualifiers['gene'][0]).replace(' ','_')
                    else: geneName=''
                    product=feature.qualifiers['product'][0]
                    annotation= '_'.join(product.split(' '))
                    trans_seq=feature.qualifiers['translation'][0]
                    if 'locus_tag' in feature.qualifiers:
                        locus_tag=feature.qualifiers['locus_tag'][0]
                    else:
                        locus_tag=feature.qualifiers['db_xref'][0].split(':')[1]
                    ## force to replace '-' with '_' in locus_tag
                    if '-' in locus_tag:
                        locus_tag=locus_tag.replace('-','_')
                    if "PROKKA" in locus_tag:
                        locus_tag=locus_tag.replace('PROKKA_','')
                    if '%s_'%strainID in locus_tag:
                        locus_tag=locus_tag.split('%s_'%strainID)[1]
                    ## geneID is composed of strain_name and locus_tag
                    ## keeping the '|' separator is important; it is used later by orthAgogue.
                    geneID= '%s|%s'%(strainID,locus_tag)
                    na_seq=str(feature.extract(contig.seq))
                    write_in_fa(aa_sequence_file, geneID, trans_seq)
                    write_in_fa(nu_sequence_file, geneID, na_seq)
                    gene_aa_dict[strainID][geneID]=trans_seq
                    gene_na_dict[strainID][geneID]=na_seq
                    # store the gene name (if any) separately from the annotation in the description dict
                    geneID_to_description_dict[geneID]={'geneName': geneName,
                                                        'contig': contig_index,
                                                        'annotation': annotation}
                    if geneName!='':
                        geneName='%s_'%geneName
                    geneID_to_geneSeqID_dict[geneID]='%s|%s-%d-%s%s'%(strainID,
                                                    locus_tag, contig_index,
                                                    geneName, annotation)
            elif enable_RNA_clustering and (feature.type=='rRNA'):
            #elif not enable_RNA_clustering and (feature.type=='rRNA' or feature.type=='tRNA'):
                if 'product' in feature.qualifiers:
                    geneName=''
                    product=feature.qualifiers['product'][0]
                    annotation= '_'.join(product.split(' '))
                    try:
                        locus_tag=feature.qualifiers['locus_tag'][0]
                    except: # make a random string when locus_tag absent
                        locus_tag=time.strftime('%S',time.gmtime())+str(random.randint(0,10000000))
                    if "PROKKA" in locus_tag:
                        locus_tag=locus_tag.replace('PROKKA_','')
                    if '%s_'%strainID in locus_tag:
                        locus_tag=locus_tag.split('%s_'%strainID)[1]
                    ## RNAID is composed of strain_name and locus_tag
                    ## keeping the '|' separator is important; it is used later by orthAgogue.
                    RNAID= '%s|%s'%(strainID,locus_tag)
                    RNA_seq= str(feature.extract(contig.seq))
                    write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                    RNA_dict[strainID][RNAID]=RNA_seq
                    # store description info for the RNA (RNA features have no gene name)
                    RNAID_to_description_dict[RNAID]={
                                                    'geneName': '',
                                                    'contig': contig_index,
                                                    'annotation': annotation}
                    RNAID_to_SeqID_dict[RNAID]='%s|%s-%d-%s%s'%(strainID,
                                                    locus_tag, contig_index,
                                                    geneName, annotation)

    aa_sequence_file.close()
    nu_sequence_file.close()
    if enable_RNA_clustering:
        RNA_sequence_file.close()

    return check_CDS_passed
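A hedged usage sketch of gbk_translation for a single strain follows; the strain name and file names are assumptions, not files from the pipeline. The containers are built as nested per-strain dicts (defaultdict(dict)), matching how gene_aa_dict[strainID][geneID] and the other sequence dictionaries are indexed inside the function.

from collections import defaultdict

## per-strain sequence containers (indexed as dict[strainID][geneID] above)
gene_aa_dict = defaultdict(dict)
gene_na_dict = defaultdict(dict)
RNA_dict = defaultdict(dict)
## ID and description mappings filled in place by gbk_translation
geneID_to_geneSeqID_dict = {}
geneID_to_description_dict = {}
RNAID_to_SeqID_dict = {}
RNAID_to_description_dict = {}

## hypothetical input/output file names for one strain
has_CDS = gbk_translation('strain_1', 'strain_1.gbk',
                          'strain_1.faa', 'strain_1.fna', 'strain_1_RNA.fna',
                          geneID_to_geneSeqID_dict, geneID_to_description_dict,
                          RNAID_to_SeqID_dict, RNAID_to_description_dict,
                          gene_aa_dict, gene_na_dict, RNA_dict,
                          enable_RNA_clustering=False)
if not has_CDS:
    print 'no CDS features found in strain_1.gbk'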