import numpy as np
import pandas as pd
from Bio import Data
from Bio.Data import CodonTable  # makes Data.CodonTable available below


def do_work(idx):
     # IMPORTANT: reseed NumPy's RNG in every worker process, otherwise forked
     # workers inherit the same random state and produce identical samples.
     np.random.seed()
     #
     cds_dat = orgs.get_group(idx)
     #
     genetic_table = cds_dat['table'].iloc[0]
     genetic_code = Data.CodonTable.unambiguous_dna_by_id[genetic_table]
     SynonymousCodons = dict([(aa,[]) for aa in genetic_code.protein_alphabet.letters])
     # SynonymousCodons['STOP'] = genetic_code.stop_codons # STOP codons are excluded from analysis ...
     for codon,aa in genetic_code.forward_table.iteritems():
         SynonymousCodons[aa].append(codon)
     #
     #
     # prot_cds_rnd = cds_dat['cDNA'].sample(PROT_COUNT) # cDNA sample proteins ...
     prot_cds_rnd = cds_dat['cDNA'] # let's use ALL cDNA to get the codon bias ...
     codon_usage = cairi.count_codons(prot_cds_rnd)
     #
     # generate codon weights based on codon counts ...
     codon_weights = {}
     for aa in SynonymousCodons:
         aa_codon_usage = [ codon_usage[codon] for codon in SynonymousCodons[aa] ]
         total_codons = sum(aa_codon_usage)
         # normalize counts to frequencies (assumes every amino acid occurs at least once) ...
         codon_weights[aa] = np.true_divide(aa_codon_usage, float(total_codons))
     #
     # now rewrite (back translate) protein sequences keeping their index ...
     cdna_shuffled = (
         (ix, pid, back_translate(protein, SynonymousCodons, codon_weights))
         for ix, pid, protein in cds_dat[['pid', 'protein']].itertuples())
     cdna_shuffled = pd.DataFrame(cdna_shuffled, columns=['ix', 'pid', 'cDNA_rnd'])
     cdna_shuffled = cdna_shuffled.set_index(keys='ix')
     #
     #
     # shuffled_cdna_list.append( cdna_shuffled )
     return cdna_shuffled
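
back_translate is called inside do_work but is not defined in this excerpt. A minimal sketch of such a helper, assuming it draws one synonymous codon per residue according to the per-amino-acid weights built above (the name and argument order follow the call site; the body itself is an assumption):

def back_translate(protein, synonymous_codons, codon_weights):
    # Sample one synonymous codon per amino acid, weighted by the
    # organism-wide codon usage frequencies computed in do_work (assumed behavior).
    codons = [np.random.choice(synonymous_codons[aa], p=codon_weights[aa])
              for aa in protein]
    return ''.join(codons)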
ribo_counts = [(idx, orgs.get_group(idx)["ribosomal"].nonzero()[0].size) for idx in genom_id]
ribo_cai_info = pd.DataFrame(ribo_counts, columns=["GenomicID", "ribo_count"])

#############
#
#
cix_prot = {}
cix_ribo = {}
#
#
#############
for idx, ribo_count in ribo_cai_info.itertuples(index=False):
    #
    cds_dat = orgs.get_group(idx)
    prot_cds_rnd = cds_dat["cDNA"].sample(PROT_COUNT)  # cDNA sample proteins ...
    codon_usage_rnd = cairi.count_codons(prot_cds_rnd)
    codon_index_rnd = cairi.generate_codon_index(
        codon_usage_rnd, genetic_table=list(cds_dat["table"])[0]
    )  # fix that ...
    cix_prot[idx] = codon_index_rnd
    #
    if ribo_count >= RIBO_LIMIT:
        ribo_cds = cds_dat[cds_dat["ribosomal"]]["cDNA"]  # cDNA of ribosomal proteins ...
        codon_usage = cairi.count_codons(ribo_cds)
        codon_index = cairi.generate_codon_index(codon_usage, genetic_table=list(cds_dat["table"])[0])  # fix that ...
        cix_ribo[idx] = codon_index
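
The two dictionaries above keep one codon index per organism: cix_prot from a random cDNA sample and cix_ribo from ribosomal cDNAs (only when enough ribosomal genes are present). A sketch of how they might be applied afterwards, reusing the cairi.cai_for_gene(sequence, codon_index) call that appears later in this file; the loop and the result names cai_vs_random / cai_vs_ribo are illustrative assumptions:

cai_vs_random = {}
cai_vs_ribo = {}
for idx in cix_prot:
    cds_dat = orgs.get_group(idx)
    # CAI of every gene, scored against the random-sample codon index ...
    cai_vs_random[idx] = [cairi.cai_for_gene(seq, cix_prot[idx]) for seq in cds_dat['cDNA']]
    # ... and against the ribosomal codon index, where one was computed.
    if idx in cix_ribo:
        cai_vs_ribo[idx] = [cairi.cai_for_gene(seq, cix_ribo[idx]) for seq in cds_dat['cDNA']]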

######################

Example #4
ribo_cai_info = pd.DataFrame(ribo_counts,columns=['assembly_accession','ribo_count'])

# some lists to describe organism's CAI distribution features ...
percentile = []
median = []
mean = []
sigma = []
idx_for_ribo = []
ribo_count_for_df = [] 
#
pid_cai_list = []
for idx,ribo_count in ribo_cai_info.itertuples(index=False):
    if ribo_count >= RIBO_LIMIT:
        cds_dat = orgs.get_group(idx)
        ribo_cds = cds_dat[cds_dat['ribosomal']]['cDNA'] # cDNA of ribosomal proteins ...
        codon_usage = cairi.count_codons(ribo_cds)
        codon_index = cairi.generate_codon_index(
            codon_usage, genetic_table=list(cds_dat['table'])[0])  # fix that ...
        # we need to track the index from 'dat', as there are duplicate entries ...
        pid_cai = pd.DataFrame(
            ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
             for dat_idx, pid, sequence in cds_dat[['pid', 'cDNA']].itertuples()),
            columns=['dat_idx', 'pid', 'CAI'])
        pid_cai = pid_cai.set_index(keys='dat_idx')
        # characterize CAI distribution for a given organism ...
        local_mean = pid_cai['CAI'].mean()
        local_median = pid_cai['CAI'].median()
        local_sigma = pid_cai['CAI'].std()
        mean.append(local_mean)
        median.append(local_median)
        sigma.append(local_sigma)
        idx_for_ribo.append(idx)
        ribo_count_for_df.append(ribo_count)
        #
        local_ribo_indexes = cds_dat['ribosomal'].nonzero()[0]
Example #5
genom_id = orgs.groups.keys()

# some lists to describe organism's CAI distribution features ...
percentile = []
median = []
mean = []
sigma = []
idx_for_prot = []
#
pid_cai_list = []
for idx in genom_id:
    cds_dat = orgs.get_group(idx)
    # instead of taking specific group of proteins, let's take RANDOM sample of 50 cDNA ...
    prot_cds = cds_dat['cDNA'].sample(PROT_COUNT)  # cDNA sample proteins ...
    codon_usage = cairi.count_codons(prot_cds)
    codon_index = cairi.generate_codon_index(
        codon_usage, genetic_table=list(cds_dat['table'])[0])  # fix that ...
    # we need to track the index from 'dat', as there are duplicate entries ...
    pid_cai = pd.DataFrame(
        ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
         for dat_idx, pid, sequence in cds_dat[['pid', 'cDNA']].itertuples()),
        columns=['dat_idx', 'pid', 'CAI'])
    pid_cai = pid_cai.set_index(keys='dat_idx')
    # characterize CAI distribution for a given organism ...
    local_mean = pid_cai['CAI'].mean()
    local_median = pid_cai['CAI'].median()
    local_sigma = pid_cai['CAI'].std()
    mean.append(local_mean)
    median.append(local_median)
    sigma.append(local_sigma)
mean = []
sigma = []
idx_for_prot = []
#
pid_cai_list = []
for idx in genom_id:
    # cds_dat = orgs.get_group(idx) # old stuff ...
    org_cai = cai.get_group(idx)
    condition = org_cai['CAI'].notnull().all()
    if not condition:
        print "skipping", idx, "as it has missing CAI values ..."
    else:
        q30 = org_cai['CAI'].quantile(q=0.30)
        # take 50 random proteins from the bottom 30% OF ORIGINAL CAI ...
        prot_cds = org_cai[org_cai.CAI<q30]['cDNA'].sample(PROT_COUNT)
        codon_usage = cairi.count_codons(prot_cds)
        codon_index = cairi.generate_codon_index(
            codon_usage, genetic_table=list(org_cai['table'])[0])  # fix that ...
        # we need to track the index from 'dat', as there are duplicate entries ...
        pid_cai = pd.DataFrame(
            ((dat_idx, pid, cairi.cai_for_gene(sequence, codon_index))
             for dat_idx, pid, sequence in org_cai[['pid', 'cDNA']].itertuples()),
            columns=['dat_idx', 'pid', 'CAI'])
        pid_cai = pid_cai.set_index(keys='dat_idx')
        # characterize CAI distribution for a given organism ...
        local_mean = pid_cai['CAI'].mean()
        local_median = pid_cai['CAI'].median()
        local_sigma = pid_cai['CAI'].std()
        mean.append(local_mean)
        median.append(local_median)
        sigma.append(local_sigma)
        idx_for_prot.append(idx)
        #
        # get relative indexes of sampled cDNAs in 'org_cai' ...
        local_prot_indexes = prot_cds.index - org_cai.index[0]
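
The loop above collects per-organism summary statistics of the recomputed CAI distribution into parallel lists. A minimal sketch of gathering them into one table, assuming all four lists are (re)initialized together and stay aligned per organism; the column names are illustrative, not from the source:

cai_summary = pd.DataFrame({'assembly_accession': idx_for_prot,
                            'CAI_mean': mean,
                            'CAI_median': median,
                            'CAI_sigma': sigma})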