Example #1
def export_gain_loss(tree, path):
    '''
    write the final tree and export gene gain/loss events:
    per-branch gain/loss strings (json) and per-gene event counts (pickle)
    '''
    # write final tree with internal node names as assigned by treetime
    sep = '/'
    output_path = sep.join([path.rstrip(sep), 'geneCluster/'])
    tree_fname = sep.join([output_path, 'tree_result.newick'])
    Phylo.write(tree.tree, tree_fname, 'newick')

    from collections import defaultdict
    gene_gain_loss_dict = defaultdict(str)
    for node in tree.tree.find_clades(
            order='preorder'):  # order does not matter much here
        if node.up is None: continue
        #print(node.name ,len(node.geneevents),node.geneevents)
        gain_loss = [
            str(int(ancestral) * 2 + int(derived)) for ancestral, derived in
            zip(node.up.genepresence, node.genepresence)
        ]
        gene_gain_loss_dict[node.name] = "".join(gain_loss)

    gain_loss_array = np.array(
        [[i for i in gain_loss_str]
         for gain_loss_str in gene_gain_loss_dict.values()],
        dtype=int)
    # 1 and 2 are codes for gain/loss events
    events_array = ((gain_loss_array == 1) |
                    (gain_loss_array == 2)).sum(axis=0)
    events_dict = {index: event for index, event in enumerate(events_array)}
    events_dict_path = sep.join([output_path, 'dt_geneEvents.cpk'])
    write_pickle(events_dict_path, events_dict)

    # export gene loss dict to json for visualization
    gene_loss_fname = sep.join([output_path, 'geneGainLossEvent.json'])
    write_json(gene_gain_loss_dict, gene_loss_fname, indent=1)
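
The branch encoding above packs the ancestral and derived presence states into a single digit. A minimal, self-contained sketch with toy presence vectors (illustrative only, not part of the pipeline):

# 0 = absent in both, 1 = gain, 2 = loss, 3 = present in both
parent_presence = [0, 0, 1, 1]
child_presence = [0, 1, 0, 1]
codes = [str(int(a) * 2 + int(d))
         for a, d in zip(parent_presence, child_presence)]
assert ''.join(codes) == '0123'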
Example #2
def make_genepresence_alignment(path):
    '''
    loop over all gene clusters and append 0/1 to each strain-specific
    string, which is used as a pseudo-alignment of gene presence/absence
    '''
    geneClusterPath = '%s%s' % (path, 'protein_fna/diamond_matches/')
    output_path = '%s%s' % (path, 'geneCluster/')

    ## load strain list and prepare for gene presence/absence
    strain_list = load_pickle(path + 'strain_list.cpk')
    set_totalStrain = set([istrain for istrain in strain_list])
    totalStrain = len(set_totalStrain)
    dt_strainGene = defaultdict(list)

    sorted_genelist = load_sorted_clusters(path)
    ## sorted_genelist: [(clusterID, [ count_strains,[memb1,...],count_genes]),...]
    for gid, (clusterID, gene) in enumerate(sorted_genelist):
        gene_list = gene[1]
        ## append 0/1 to each strain
        dt_strainGene = create_genePresence(dt_strainGene, totalStrain,
                                            set_totalStrain, gene_list)

    with open(output_path + 'genePresence.aln', 'wb') as presence_outfile:
        for istkey in dt_strainGene:
            dt_strainGene[istkey] = ''.join(dt_strainGene[istkey])
            write_in_fa(presence_outfile, istkey, dt_strainGene[istkey])

    write_pickle(output_path + 'dt_genePresence.cpk', dt_strainGene)
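
create_genePresence is a project helper not shown in these examples. A simplified stand-in, assuming it appends '1' for every strain that carries a member of the current cluster and '0' otherwise (assumption based on the docstring above):

from collections import defaultdict

def append_presence_column(dt_strainGene, set_totalStrain, gene_list):
    # gene identifiers follow the 'strain|locus_tag' scheme, so the strain
    # name is the part before '|'
    strains_with_gene = set(g.split('|')[0] for g in gene_list)
    for strain in set_totalStrain:
        dt_strainGene[strain].append('1' if strain in strains_with_gene else '0')
    return dt_strainGene

dt = append_presence_column(defaultdict(list), {'strainA', 'strainB'},
                            ['strainA|gene_0001'])
# dt -> {'strainA': ['1'], 'strainB': ['0']}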
def update_gene_cluster_with_RNA(path, diamond_RNACluster_dt,
                                 diamond_geneCluster_dt):
    ## update gene cluster pickled file
    cluster_path = path + 'protein_faa/diamond_matches/'

    diamond_geneCluster_dt.update(diamond_RNACluster_dt)

    write_pickle(cluster_path + 'orthamcl-allclusters_final.cpk',
                 diamond_geneCluster_dt)
Example #4
def diamond_input(path, strain_lst, disable_RNA_clustering=0):
    '''
        go through all GenBank files and extract sequences and metadata for each one
    '''
    each_gbk_path = '%s%s' % (path, 'input_GenBank/')
    os.system('mkdir %s;mv %s*gbk %s' % (each_gbk_path, path, each_gbk_path))
    protein_folder = '%s%s' % (path, 'protein_faa/')
    os.system('mkdir %s' % protein_folder)
    nucleotide_dict_path = '%s%s' % (path, 'nucleotide_fna/')
    os.system('mkdir %s' % nucleotide_dict_path)
    RNA_folder = '%s%s' % (path, 'RNA_fna/')
    os.system('mkdir %s' % RNA_folder)
    ## CDS
    geneID_to_geneSeqID_file = path + 'geneID_to_geneSeqID.cpk'
    geneID_to_geneSeqID_dict = defaultdict()
    geneID_to_description_file = path + 'geneID_to_description.cpk'
    geneID_to_description_dict = defaultdict()
    ## RNA
    RNAID_to_SeqID_file = path + 'RNAID_to_SeqID.cpk'
    RNAID_to_SeqID_dict = defaultdict()
    RNAID_to_description_file = path + 'RNAID_to_description.cpk'
    RNAID_to_description_dict = defaultdict()
    for strain_name in strain_lst:
        diamond_input_fname = protein_folder + '%s%s' % (strain_name, '.faa')
        RNA_blast_input_fname = RNA_folder + '%s%s' % (strain_name, '.fna')
        gbk_translation(each_gbk_path, nucleotide_dict_path,
                        '%s%s' % (strain_name, '.gbk'), diamond_input_fname,
                        RNA_blast_input_fname, geneID_to_geneSeqID_dict,
                        geneID_to_description_dict, RNAID_to_SeqID_dict,
                        RNAID_to_description_dict, disable_RNA_clustering)
    write_pickle(geneID_to_geneSeqID_file, geneID_to_geneSeqID_dict)
    write_pickle(geneID_to_description_file, geneID_to_description_dict)
    if disable_RNA_clustering == 0:
        write_pickle(RNAID_to_SeqID_file, RNAID_to_SeqID_dict)
        write_pickle(RNAID_to_description_file, RNAID_to_description_dict)
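
A hypothetical call for orientation (path and strain names are made up; the per-strain files are written by gbk_translation):

# diamond_input('./pan_genome_run/', ['strainA', 'strainB'])
#
# expects ./pan_genome_run/strainA.gbk etc. directly under the path (the .gbk
# files are moved into input_GenBank/) and afterwards writes, per strain:
#   ./pan_genome_run/protein_faa/strainA.faa              (diamond input)
#   ./pan_genome_run/RNA_fna/strainA.fna                  (RNA blast input)
#   ./pan_genome_run/nucleotide_fna/strainA_gene_nuc_dict.cpk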
Example #5
def parse_RNACluster(path, inputfile):
    """ store clusters as dictionary in cpk file """
    from operator import itemgetter
    inputfile = "%s%s" % (path, inputfile)
    with open(inputfile, 'rb') as infile:
        RNACluster_dt = defaultdict(list)
        for gid, iline in enumerate(
                infile):  ##format: NC_022226|1-1956082:1956435
            col = iline.rstrip().split('\t')
            clusterID = "GC_RNA%03d" % gid
            RNACluster_dt[clusterID] = [0, [], 0]
            ## num_strains
            RNACluster_dt[clusterID][0] = len(
                dict(Counter([ivg.split('|')[0] for ivg in col])).keys())
            ## num_RNAs
            RNACluster_dt[clusterID][2] = len(
                dict(Counter([ivg for ivg in col])).keys())
            ## RNA members
            RNACluster_dt[clusterID][1] = [icol for icol in col]
    write_pickle(path + 'orthamcl-allclusters.cpk', RNACluster_dt)
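
The Counter/dict round-trip above only counts distinct items; a plain set gives the same numbers. A toy trace for one tab-separated cluster line (member names are made up):

col = ['strainA|rna_01', 'strainA|rna_07', 'strainB|rna_03']
num_strains = len(set(ivg.split('|')[0] for ivg in col))  # 2 distinct strains
num_RNAs = len(set(col))                                   # 3 distinct members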
def parse_geneCluster(path, inputfile, cluster_log=False):
    """ store clusters as dictionary in cpk file """
    from operator import itemgetter
    inputfile = "%s%s" % (path, inputfile)
    with open(inputfile, 'rb') as infile:
        geneCluster_dt = defaultdict(list)
        for gid, iline in enumerate(
                infile):  ##format: NC_022226|1-1956082:1956435
            col = iline.rstrip().split('\t')
            clusterID = "GC_%08d" % gid
            geneCluster_dt[clusterID] = [0, [], 0]
            ## num_strains
            geneCluster_dt[clusterID][0] = len(
                dict(Counter([ivg.split('|')[0] for ivg in col])).keys())
            ## num_genes
            geneCluster_dt[clusterID][2] = len(
                dict(Counter([ivg for ivg in col])).keys())
            ## gene members
            geneCluster_dt[clusterID][1] = [icol for icol in col]
    write_pickle(path + 'orthamcl-allclusters.cpk', geneCluster_dt)

    if cluster_log == True:
        with open(path + 'orthamcl-allclusters.log', 'wb') as write_fn_lst:
            orthagogue_geneCount_lst = sorted(
                geneCluster_dt.iteritems(), key=itemgetter(1), reverse=True)
            for kd, vd in orthagogue_geneCount_lst:
                write_fn_lst.write('%s%s\n' % (kd, vd))
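
The log above is sorted on the whole value [num_strains, [members], num_genes], so list comparison ranks clusters by strain count first. A toy illustration (cluster contents are made up):

from operator import itemgetter
clusters = {'GC_00000001': [3, ['a', 'b', 'c'], 4],
            'GC_00000002': [5, ['d', 'e'], 2]}
ranked = sorted(clusters.items(), key=itemgetter(1), reverse=True)
# ranked[0][0] == 'GC_00000002' -- the cluster found in most strains comes first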
Example #7
def load_strains():
    """ load input strains into strain_list.cpk
        (relies on the module-level globals `path` and `strain_list`) """
    if os.path.isfile(path + strain_list):
        with open(path + strain_list, 'rb') as infile:
            write_pickle(path + 'strain_list.cpk',
                         [ist.rstrip().split('.gbk')[0] for ist in infile])
def create_core_SNP_matrix(path):
    """ create SNP matrix using core gene SNPs
        input: strain_list.cpk, core_geneList.cpk
        output: SNP_whole_matrix.aln
    """
    import os, sys, operator
    import numpy as np
    from collections import defaultdict
    from SF00_miscellaneous import read_fasta, write_pickle, load_pickle, write_in_fa

    alnFilePath = '%s%s' % (path, 'geneCluster/')
    output_path = alnFilePath

    ## create core gene list
    corelist = []
    totalStrain = len(load_pickle(path + 'strain_list.cpk'))
    sorted_geneList = load_sorted_clusters(path)
    with open(output_path + 'core_geneList.txt', 'wb') as outfile:
        for clusterID, vg in sorted_geneList:
            if vg[0] == totalStrain and vg[2] == totalStrain:
                coreGeneName = '%s%s' % (clusterID, '_na.aln')
                ## sequences might be discarded because of premature stops
                coreGeneName_path = alnFilePath + coreGeneName
                if os.path.exists(coreGeneName_path) and len(
                        read_fasta(coreGeneName_path)) == totalStrain:
                    outfile.write(coreGeneName + '\n')
                    corelist.append(coreGeneName)
                else:
                    print '%s%s%s' % ('warning: ', coreGeneName_path,
                                      ' is not a core gene')
        write_pickle(output_path + 'core_geneList.cpk', corelist)

    refSeqList = load_pickle(path + 'strain_list.cpk')
    refSeqList.sort()

    snp_fre_lst = []
    snp_wh_matrix_flag = 0
    snp_pos_dt = defaultdict(list)
    snp_whole_matrix = np.array([])

    snps_by_gene = []
    for align_file in corelist:  ## all core genes
        fa_dt = read_fasta(alnFilePath + align_file)
        fa_sorted_lst = sorted(fa_dt.items(), key=lambda x: x[0].split('|')[0])
        nuc_array = np.array([])
        flag = 0
        for ka, va in enumerate(fa_sorted_lst):
            if flag == 0:
                flag = 1
                nuc_array = np.array(np.fromstring(va[1], dtype='S1'))
            else:
                nuc_array = np.vstack(
                    (nuc_array, np.fromstring(va[1], dtype='S1')))

        position_polymorphic = np.where(
            np.all(nuc_array == nuc_array[0, :], axis=0) == False)[0]
        position_has_gap = np.where(np.any(nuc_array == '-', axis=0))[0]
        position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)
        snp_columns = nuc_array[:, position_SNP]
        snp_pos_dt[align_file] = position_SNP

        if snp_wh_matrix_flag == 0:
            snp_whole_matrix = snp_columns
            snp_wh_matrix_flag = 1
        else:
            snp_whole_matrix = np.hstack((snp_whole_matrix, snp_columns))

    write_pickle(output_path + 'snp_pos.cpk', snp_pos_dt)

    with open(output_path + 'SNP_whole_matrix.aln', 'wb') as outfile:
        for ind, isw in enumerate(snp_whole_matrix):
            write_in_fa(outfile, refSeqList[ind], isw.tostring())
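
A toy illustration of the SNP-column extraction used above: polymorphic columns are kept only when no strain has a gap in them (alignment values are made up):

import numpy as np

aln = np.array([list('ACGT-A'),
                list('ACGTAA'),
                list('ATGTAA')])
position_polymorphic = np.where(~np.all(aln == aln[0, :], axis=0))[0]  # columns 1 and 4
position_has_gap = np.where(np.any(aln == '-', axis=0))[0]             # column 4
position_SNP = np.setdiff1d(position_polymorphic, position_has_gap)    # column 1 only
snp_columns = aln[:, position_SNP]  # the 'C'/'C'/'T' column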
def diamond_orthamcl_cluster(path, threads, blast_cluster_file_path='none',
                             roary_cluster_file_path='none',
                             diamond_orthamcl_cluster='600', mcl_inflation=2.0):
    '''
    make an all-against-all comparison using diamond,
    THEN generate gene clusters with orthoMCL/orthAgogue+MCL,
    OR use the output of an all-to-all blast comparison and orthoMCL/orthAgogue+MCL,
    OR use the output of roary
    params:
        path:                    path to directory including data and output
        threads:                 number of parallel threads used to run diamond
        blast_cluster_file_path: gene clusters by all-to-all blast 
                                 comparison and orthoMCL/orthagogue+MCL
        roary_cluster_file_path: gene clusters by roary
        diamond_orthamcl_cluster: diamond setting: the maximum number of target
                                  sequences per query to keep alignments for.
                                  Default: #strain * #max_duplication = 40*15 = 600
    '''

    for exe in ['orthAgogue', 'mcl']:
        check_exe(exe)

    input_path = path + 'protein_faa/'
    output_path = input_path + 'diamond_matches/'
    threads = str(threads)
    ## using standard pipeline (roary_cluster_file_path=='none')
    if roary_cluster_file_path == 'none':
        if blast_cluster_file_path == 'none':
            dmd_ref_file = 'reference.faa'
            dmd_query_file = 'query.faa'
            ## prepare dmd_query_file
            os.system('mkdir ' + output_path)
            os.system('cat ' + input_path + '*faa > ' + output_path + dmd_query_file)
            ## dmd_query_file is dmd_ref_file
            os.system('cp ' + output_path + dmd_query_file + ' ' +
                      output_path + dmd_ref_file)
            diamond_run(output_path, output_path, dmd_ref_file, threads,
                        diamond_orthamcl_cluster)
            ortha_mcl_run(output_path, threads, mcl_inflation)
            ## save singletons
            origin_cluster_file = 'orthamcl-cluster.output'
            orthagogue_singletons(output_path, origin_cluster_file, dmd_query_file)
            ## clean up diamond_query_file
            os.system(''.join(['rm ', output_path, '*faa']))
            all_cluster_file = 'orthamcl-allclusters.csv'
            parse_geneCluster(output_path, all_cluster_file)
        else:  ## using user-given cluster file based on blast
            os.system('mkdir %s' % output_path)
            os.system('ln -sf %s %sclustered_proteins' %
                      (blast_cluster_file_path, output_path))
            from operator import itemgetter
            ## create gene cluster from blast output
            with open(blast_cluster_file_path, 'rb') as infile:
                geneCluster_dt = defaultdict(list)
                for gid, iline in enumerate(infile):
                    column = [ico.replace('_', '|')
                              for ico in iline.rstrip().split('\t')]
                    clusterID = "GC_%08d" % gid
                    gene_list = [ico for ico in column]
                    geneCluster_dt[clusterID] = [0, [], 0]
                    num_strains = len(
                        dict(Counter([ivg.split('|')[0] for ivg in gene_list])))
                    num_gene = len(dict(Counter([ivg for ivg in column])))
                    geneCluster_dt[clusterID][0] = num_strains
                    geneCluster_dt[clusterID][2] = num_gene
                    geneCluster_dt[clusterID][1] = gene_list
            write_pickle(output_path + 'orthamcl-allclusters.cpk', geneCluster_dt)

            orthagogue_geneCount_lst = sorted(
                geneCluster_dt.iteritems(), key=itemgetter(1), reverse=True)
            with open(output_path + 'orthamcl-allclusters.log', 'wb') as write_fn_lst:
                for kd, vd in orthagogue_geneCount_lst:
                    write_fn_lst.write('%s%s\n' % (kd, vd))
    else:  ## using cluster files from roary
        os.system('mkdir %s' % output_path)
        os.system('ln -sf %s %sclustered_proteins' %
                  (roary_cluster_file_path, output_path))
        with open(roary_cluster_file_path, 'rb') as cluster_external_file:
            with open(output_path + 'orthamcl-allclusters.csv', 'wb') as cluster_final_file:
                for cluster_line in cluster_external_file:
                    cluster_final_file.write('%s\n' % '\t'.join(
                        [gene_tag.replace('_', '|') if '|' not in gene_tag
                         else gene_tag
                         for gene_tag in cluster_line.rstrip().split(': ')[1].split('\t')]))
        all_cluster_file = 'orthamcl-allclusters.csv'
        parse_geneCluster(output_path, all_cluster_file)
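
What the roary branch above does to one 'clustered_proteins' line (the line itself is illustrative; roary writes 'cluster_name: tag<TAB>tag...'):

roary_line = 'group_1000: strainA_00001\tstrainB_00042'
members = roary_line.rstrip().split(': ')[1].split('\t')
members = [tag.replace('_', '|') if '|' not in tag else tag for tag in members]
# '\t'.join(members) -> 'strainA|00001\tstrainB|00042'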
def update_diversity_cpk_file(path):
    ## write gene_diversity_Dt cpk file
    output_path = path + 'geneCluster/'
    with open(output_path + 'gene_diversity.txt', 'rb') as infile:
        write_pickle(output_path + 'gene_diversity.cpk',
                     {i.rstrip().split('\t')[0]: i.rstrip().split('\t')[1]
                      for i in infile})
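
The expected gene_diversity.txt layout is one tab-separated clusterID/diversity pair per line; a toy trace of the dict comprehension above (values are made up):

line = 'GC_00000001\t0.0213\n'
clusterID, diversity = line.rstrip().split('\t')[0], line.rstrip().split('\t')[1]
# stored as {'GC_00000001': '0.0213'} in gene_diversity.cpk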
Example #11
def gbk_translation(each_gbk_path, nucleotide_dict_path, gb_file,
                    output_filename, output_filename2,
                    geneID_to_geneSeqID_dict, geneID_to_description_dict,
                    RNAID_to_SeqID_dict, RNAID_to_description_dict,
                    disable_RNA_clustering):
    '''
    extract sequences and meta information for all genes in one reference GenBank file
    params:
        - each_gbk_path:    path to the set of reference sequences used to construct
                            the core genome
        - nucleotide_dict_path:
                            path to the cPickled dicts of all nucleotide sequences 
                            for each genome
        - gb_file:          name of the reference to be analyzed
        - output_filename:  file into which all amino acid sequences are written
                            in fasta format. needed as input for diamond
        - output_filename2: RNA nucleotide_sequences are written in fasta format.
                            Needed as RNA_blast_input
        - geneID_to_geneSeqID_dict: dictionary linking geneID to gene sequence ID
                            modified in place (key: geneID; value: geneSeqID )
        - geneID_to_description_dict: dictionary linking geneID to description info
                            modified in place (key: geneID; value: a dict including
                            information on contig_index, annotation or more)
        - RNAID_to_SeqID_dict: dictionary linking RNAID to RNA sequence ID
                            modified in place (key: RNAID; value: SeqID )
        - RNAID_to_description_dict: dictionary linking RNAID to description info
                            modified in place (key: RNAID; value: a dict including
                            information on contig_index, annotation or more)
        - disable_RNA_clustering: if set, do not cluster rRNA and tRNA (default: 0 -> cluster RNAs)
    '''

    reference_gb = '%s%s' % (each_gbk_path, gb_file)
    strainName = gb_file.split('.gbk')[0]
    gene_nuc_seq_dict = '%s%s_gene_nuc_dict.cpk' % (nucleotide_dict_path,
                                                    strainName)
    gene_nucleotide_sequences = defaultdict()
    aa_sequence_file = open(output_filename, 'wb')

    if disable_RNA_clustering == 0:
        RNA_nuc_seq_dict = '%s%s_RNA_nuc_dict.cpk' % (nucleotide_dict_path,
                                                      strainName)
        RNA_nucleotide_sequences = defaultdict()
        RNA_sequence_file = open(output_filename2, 'wb')

    contig_index = 0
    for contig in SeqIO.parse(reference_gb, 'genbank'):
        contig_index += 1
        for feature in contig.features:
            if feature.type == 'CDS':
                if 'product' in feature.qualifiers and 'translation' in feature.qualifiers:
                    if 'gene' in feature.qualifiers:
                        geneName = '%s' % (
                            feature.qualifiers['gene'][0]).replace(' ', '_')
                    else:
                        geneName = ''
                    product = feature.qualifiers['product'][0]
                    annotation = '_'.join(product.split(' '))
                    trans_seq = feature.qualifiers['translation'][0]
                    locus_tag = feature.qualifiers['locus_tag'][0]
                    if "PROKKA" in locus_tag:
                        locus_tag = locus_tag.replace('PROKKA_', '')
                    if '%s_' % strainName in locus_tag:
                        locus_tag = locus_tag.split('%s_' % strainName)[1]
                    ## geneID is composed of strain_name and locus_tag
                    ## Keeping the '|' separator is important; it is used later by orthAgogue.
                    geneID = '%s|%s' % (strainName, locus_tag)
                    write_in_fa(aa_sequence_file, geneID, trans_seq)
                    # give the tag 'gname:' to genes which have a gene name and keep it separate from the annotation
                    geneID_to_description_dict[geneID] = {
                        'geneName': geneName,
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    if geneName != '':
                        geneName = '%s_' % geneName
                    geneID_to_geneSeqID_dict[geneID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)

                    gene_nucleotide_sequences[geneID] = feature.extract(
                        contig.seq)
            elif not disable_RNA_clustering and (feature.type == 'rRNA'
                                                 or feature.type == 'tRNA'):
                if 'product' in feature.qualifiers:
                    geneName = ''
                    product = feature.qualifiers['product'][0]
                    annotation = '_'.join(product.split(' '))
                    locus_tag = feature.qualifiers['locus_tag'][0]
                    if "PROKKA" in locus_tag:
                        locus_tag = locus_tag.replace('PROKKA_', '')
                    if '%s_' % strainName in locus_tag:
                        locus_tag = locus_tag.split('%s_' % strainName)[1]
                    ## RNAID is composed of strain_name and locus_tag
                    ## Keeping the '|' separator is important; it is used later by orthAgogue.
                    RNAID = '%s|%s' % (strainName, locus_tag)
                    RNA_seq = str(feature.extract(contig.seq))
                    write_in_fa(RNA_sequence_file, RNAID, RNA_seq)
                    # RNAs carry no gene name; store only the contig index and annotation
                    RNAID_to_description_dict[RNAID] = {
                        'geneName': '',
                        'contig': contig_index,
                        'annotation': annotation
                    }
                    RNAID_to_SeqID_dict[RNAID] = '%s|%s-%d-%s%s' % (
                        strainName, locus_tag, contig_index, geneName,
                        annotation)
                    RNA_nucleotide_sequences[RNAID] = RNA_seq

    write_pickle(gene_nuc_seq_dict, gene_nucleotide_sequences)
    if disable_RNA_clustering == 0:
        write_pickle(RNA_nuc_seq_dict, RNA_nucleotide_sequences)
    aa_sequence_file.close()
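
For reference, the ID scheme assembled above, traced on made-up values:

strainName, locus_tag, contig_index = 'strainA', '00123', 1
geneName = 'rpoB'
annotation = 'DNA-directed_RNA_polymerase_subunit_beta'
geneID = '%s|%s' % (strainName, locus_tag)  # 'strainA|00123'
if geneName != '':
    geneName = '%s_' % geneName
geneSeqID = '%s|%s-%d-%s%s' % (strainName, locus_tag, contig_index,
                               geneName, annotation)
# 'strainA|00123-1-rpoB_DNA-directed_RNA_polymerase_subunit_beta'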