Beispiel #1
0
# Per-organism accumulators: one entry is appended for every organism that
# passes the RIBO_LIMIT filter in the loop below.
percentile = []
median = []
mean = []
sigma = []
idx_for_ribo = []
ribo_count_for_df = []
#
pid_cai_list = []
# ribo_cai_info is unpacked as (organism id, ribosomal count) pairs
# (assumes exactly two columns -- TODO confirm against where it is built).
for idx, ribo_count in ribo_cai_info.itertuples(index=False):
    if ribo_count >= RIBO_LIMIT:
        cds_dat = orgs.get_group(idx)
        # cDNA of the rows flagged in the boolean 'ribosomal' column ...
        ribo_cds = cds_dat[cds_dat['ribosomal']]['cDNA']
        codon_usage = cairi.count_codons(ribo_cds)
        # TODO (original note: "fix that"): the genetic table is taken from
        # the first row of the group only.
        codon_index = cairi.generate_codon_index(
            codon_usage, genetic_table=list(cds_dat['table'])[0])
        # we need to track the index from 'dat', as there are some duplications ...
        # itertuples() yields (index label, pid, cDNA) for every row of the group.
        pid_cai = pd.DataFrame(
            ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
             for dat_idx, pid, sequence in cds_dat[['pid', 'cDNA']].itertuples()),
            columns=['dat_idx', 'pid', 'CAI'])
        pid_cai = pid_cai.set_index(keys='dat_idx')
        # characterize CAI distribution for a given organism ...
        local_mean = pid_cai['CAI'].mean()
        local_median = pid_cai['CAI'].median()
        local_sigma = pid_cai['CAI'].std()
        mean.append(local_mean)
        median.append(local_median)
        sigma.append(local_sigma)
        idx_for_ribo.append(idx)
        ribo_count_for_df.append(ribo_count)
        #
        # Positions (0-based, within the group) of the ribosomal rows.
        # Series.nonzero() was removed in pandas 1.0; going through the
        # underlying array gives the same positions on old and new pandas.
        local_ribo_indexes = cds_dat['ribosomal'].values.nonzero()[0]
        # pid_cai has one row per cds_dat row, in the same order, so the
        # positions can be applied to it directly.
        local_ribo = pid_cai.iloc[local_ribo_indexes].reset_index(drop=True)
        # let's also check our t.o. score
        # upper quartile of CAI over all rows of this organism
        qH_all = pid_cai['CAI'].quantile(q=0.75)
Beispiel #2
0
# Per-organism accumulators: one entry is appended per id in genom_id.
median = []
mean = []
sigma = []
idx_for_prot = []
#
pid_cai_list = []
for idx in genom_id:
    cds_dat = orgs.get_group(idx)
    # instead of taking a specific group of proteins, take a RANDOM sample of
    # PROT_COUNT cDNAs (sampling is without replacement, so pandas raises
    # ValueError when the group has fewer than PROT_COUNT rows).
    prot_cds = cds_dat['cDNA'].sample(PROT_COUNT)  # cDNA sample proteins ...
    codon_usage = cairi.count_codons(prot_cds)
    # TODO (original note: "fix that"): the genetic table is taken from the
    # first row of the group only.
    codon_index = cairi.generate_codon_index(
        codon_usage, genetic_table=list(cds_dat['table'])[0])
    # we need to track the index from 'dat', as there are some duplications ...
    # itertuples() yields (index label, pid, cDNA) for every row of the group.
    pid_cai = pd.DataFrame(
        ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
         for dat_idx, pid, sequence in cds_dat[['pid', 'cDNA']].itertuples()),
        columns=['dat_idx', 'pid', 'CAI'])
    pid_cai = pid_cai.set_index(keys='dat_idx')
    # characterize CAI distribution for a given organism ...
    local_mean = pid_cai['CAI'].mean()
    local_median = pid_cai['CAI'].median()
    local_sigma = pid_cai['CAI'].std()
    mean.append(local_mean)
    median.append(local_median)
    sigma.append(local_sigma)
    idx_for_prot.append(idx)
    # get relative (0-based) positions of the sampled cDNAs in 'cds_dat' ...
    # Looking the labels up in the group's own index avoids assuming that the
    # group occupies a block of consecutive integer labels; for such a block
    # the result equals `prot_cds.index - cds_dat.index[0]`.
    # NOTE(review): get_indexer requires unique index labels in cds_dat.
    local_prot_indexes = cds_dat.index.get_indexer(prot_cds.index)
    local_prot = pid_cai.iloc[local_prot_indexes].reset_index(drop=True)
    # let's also check our t.o. score