예제 #1
0
def plot_opt(seq, ref_table = 'Scer', plot_par = 'decoding.time',window=25,plot_colors = ['gold','cornflowerblue','mediumpurple']):
    '''Plot a codon parameter along a sequence together with its extremes.

    The per-codon value of plot_par is drawn as a scatter, overlaid with a
    moving average of the actual sequence and of the two sequences returned
    by opt_seq for the same amino acid sequence when optimising plot_par
    with max and with min.

    Parameters
    ----------
    seq : str
        The DNA or RNA sequence to be plotted. It is upper-cased and 'U' is
        replaced by 'T'; an incomplete trailing codon is ignored.
    ref_table : str
        The name of the reference table passed to load_from_Data and opt_seq.
    plot_par : str
        The column of the reference table that is plotted.
    window : int
        The number of codons averaged for each point of the smoothed lines.
    plot_colors : list of str
        Colours for the actual, max and min series, in that order.

    Returns
    -------
    tuple
        The matplotlib figure and axes (fig, ax).

    Raises
    ------
    KeyError
        If a codon of one of the sequences is not in the reference table.
    '''

    #convert sequence to DNA
    seq = seq.upper().replace('U','T')

    parameterset = load_from_Data(ref_table)

    #generate a lookup dictionary for the plotted parameter in a single pass
    #(codons are keyed in the DNA alphabet to match the converted sequence)
    pardict = dict(zip(parameterset['codon'].str.replace('U','T'),
                       parameterset[plot_par]))

    #make the extreme parameter sequences
    aaseq = translate(seq)
    max_seq = opt_seq(aaseq,ref_table = ref_table,optimise_by = [plot_par,max],diversify = [])
    min_seq = opt_seq(aaseq,ref_table = ref_table,optimise_by = [plot_par,min], diversify = [])

    def codon_values(nt_seq):
        #parameter value of every complete codon of nt_seq
        return [pardict[nt_seq[i:i+3]] for i in range(0,len(nt_seq) - 2,3)]

    def moving_average(values):
        #a list of n values holds n - window + 1 full windows; the '+ 1'
        #keeps the final window
        return [sum(values[i:i+window])/window
                for i in range(len(values) - window + 1)]

    #prepare vectors of the codon parameter for each sequence:
    seq_pars = codon_values(seq)
    max_pars = codon_values(max_seq)
    min_pars = codon_values(min_seq)
    x = list(range(len(seq_pars)))

    #smoothed points are drawn at the centre of their window
    smooth_x_offset = int(window/2)

    #prepare the plot
    fig,ax = plt.subplots()
    ax.scatter(x, seq_pars,c=plot_colors[0],s=5,alpha=0.5)
    series = [(seq_pars,plot_colors[0],'actual'),
              (max_pars,plot_colors[1],'max'),
              (min_pars,plot_colors[2],'min')]
    for values, colour, label in series:
        smoothed = moving_average(values)
        #each line gets x values sized to its own length so that series of
        #different lengths cannot cause a shape mismatch
        smooth_x = list(range(smooth_x_offset,smooth_x_offset + len(smoothed)))
        ax.plot(smooth_x,smoothed,c=colour,label=label)

    ax.set_xlabel('Codon No')
    ax.set_ylabel(plot_par)

    plt.legend(loc='upper right')

    return fig,ax
예제 #2
0
def slow_down_seq(seq, codon_range=[], by=2, ref_table='Scer'):
    '''Re-codes a sequence so that its codons decode more slowly (or faster).

    The average decoding time per codon of the selected region is determined
    with time_seq and multiplied by 'by'. Every codon of the region is then
    replaced with the synonymous codon whose decoding time is closest to
    that target.

    Parameters
    ==========
    seq : str
        The coding sequence to be altered.

    codon_range : list of int
        [first, last] codon indices (0-based, both inclusive) of the region
        to be altered. Any value that does not have exactly two elements
        selects the entire sequence.

    by : float
        The factor applied to the current average decoding time per codon
        to obtain the target time.

    ref_table : str
        The name of the reference table passed to time_seq.

    Returns
    =======
    str
        The altered sequence with any 'U' replaced by 'T'.
    '''

    from CodOpY.analyse import time_seq, translate
    from CodOpY.misc import codon_choices

    codon_seq = [seq[n:n + 3] for n in range(0, len(seq), 3)]
    use_range = len(codon_range) == 2
    if use_range:
        first, last = codon_range
        codon_subseq = codon_seq[first:last + 1]
    else:
        codon_subseq = codon_seq

    #determine existing and new target times for subseq
    is_time = time_seq(''.join(codon_subseq),
                       ref_table=ref_table)['Average decoding time per codon']
    target_time = is_time * by

    #assemble a new sequence that is as close as possible to the new target time
    replace_codon_seq = []
    for codon in codon_subseq:
        aa = translate(codon)
        #NOTE(review): codon_choices is called without ref_table, so it
        #presumably uses its own default table - confirm this is intended
        #when ref_table is not the default.
        choice = codon_choices(aa)
        diffs = [abs(target_time - val) for val in choice['decoding.time']]
        closest = diffs.index(min(diffs))
        replace_codon_seq.append(choice.iloc[closest]['codon'])

    if use_range:
        #re-insert the altered region between the untouched flanks
        return_codon_seq = (codon_seq[:first] + replace_codon_seq +
                            codon_seq[last + 1:])
    else:
        return_codon_seq = replace_codon_seq
    return ''.join(return_codon_seq).replace('U', 'T')
예제 #3
0
def time_seq(input_seq,ref_table='Scer'):
    '''time_seq calculates the time it takes to decode a DNA or RNA sequence.

    Parameters
    ----------
    input_seq : str
        The DNA or RNA sequence for which the time properties are being
        returned. Upper and lower case are both accepted.
    ref_table : str
        The name of the reference table to be used, eg 'Scer' for S. cerevisiae

    Returns
    -------
    dict
        A dictionary containing the overall 'Decoding time', the
        'Average decoding time per codon', and the 'CV' (coefficient of
        variation of the decoding time per codon, in percent). Times are in
        the units of the 'decoding.time' column of the reference table.

    Raises
    ------
    KeyError
        If the sequence contains a codon that is neither in the reference
        table nor a stop codon (this includes an incomplete trailing codon).
    '''

    #convert the input sequence to upper-case DNA, as the other functions
    #working on nucleotide sequences do
    input_seq = input_seq.upper().replace('U','T')
    #make a look-up dictionary of the decoding time of each codon from the
    #reference table; the table itself is left unmodified
    ref = load_from_Data(ref_table)
    dna_codons = ref['codon'].str.replace('U','T')
    time_dict_by_codon = dict(zip(dna_codons, ref['decoding.time']))
    #add pseudo data for stop codons
    for stop_codon in ('TGA','TAA','TAG'):
        time_dict_by_codon[stop_codon] = 0
    #calculate the decoding time properties of the sequence
    codon_seq = [input_seq[n:n+3] for n in range(0,len(input_seq),3)]
    times_vec = [time_dict_by_codon[codon] for codon in codon_seq]
    results = {}
    results['Decoding time'] = np.sum(times_vec)
    results['Average decoding time per codon'] = results['Decoding time'] / len(codon_seq)
    #sample standard deviation (ddof=1) relative to the mean, in percent
    results['CV'] = np.std(times_vec, ddof=1) / np.mean(times_vec) * 100
    return results
예제 #4
0
def test_RE(RE, test_seq):
    '''
    Tests whether an RE site is present in a sequence and reports where.

    Parameters
    ==========
    RE : str
        The name of the enzyme, or the site sequence, to be tested. Site
        sequences may contain IUPAC ambiguity symbols (W, S, M, K, R, Y, B,
        D, H, V, N).

    test_seq : str
        The DNA or RNA sequence to be tested.

    Returns
    =======
    None
        The result is printed: either the 0-based start positions of all
        (possibly overlapping) occurrences of the site, or a message that
        the site was not found or that RE could not be interpreted.
    '''

    from itertools import product

    RE_ref = load_from_Data('RE_List')

    ambiguous_bases = {'W':['A','T'],'S':['G','C'],'M':['A','C'],'K':['G','T'],'R':['A','G'],'Y':['C','T'],
                       'B':['C','G','T'],'D':['A','G','T'],'H':['A','C','T'],'V':['A','C','G'],'N':['A','C','G','T']}
    #every symbol that can be expanded below is accepted in a site sequence
    valid_symbols = set('ACGT') | set(ambiguous_bases)

    #compare names in upper case without modifying the loaded table
    RE = RE.upper()
    enzyme_names = RE_ref['Name'].str.upper()

    #is site a restriction enzyme name?
    if RE in enzyme_names.values:
        RE_seq = RE_ref.loc[enzyme_names == RE]['Motif'].values[0]
    #else is site a valid DNA sequence?
    elif RE and all(c in valid_symbols for c in RE):
        RE_seq = RE
    else:
        print('No known Restriction enzyme site or valid DNA sequence specified.')
        return
    #remove leading or trailing Ns from RE site
    RE_seq = RE_seq.strip('N')
    if not RE_seq:
        #a site made up of N only matches everything and cannot be tested
        print('No known Restriction enzyme site or valid DNA sequence specified.')
        return
    #does RE_seq now have more than 5 'N' (this becomes very inefficient)
    if RE_seq.count('N') > 5:
        print('Too many N - the maximum number of N allowed in the RE sequence is 5')
        return

    #list all unambiguous sequences corresponding to the site; symbols
    #without an expansion are kept as they are
    unamb_RE_seqs = {''.join(combo) for combo in
                     product(*(ambiguous_bases.get(nt, [nt]) for nt in RE_seq))}

    #convert sequence to valid uppercase DNA sequence
    test_seq = test_seq.upper().replace('U','T')

    #record the start of every window of the sequence that matches the site
    site_len = len(RE_seq)
    found_sites = [pos for pos in range(len(test_seq) - site_len + 1)
                   if test_seq[pos:pos + site_len] in unamb_RE_seqs]

    if not found_sites:
        print(RE + ' not found in this sequence. \n')
        return
    print(RE + ' found at the following site(s): ' + ', '.join(str(pos) for pos in found_sites) + '\n')
    return
예제 #5
0
def opt_seq(seq,
            diversify=[],
            diversify_range=0.2,
            ref_table='Scer',
            enforce={},
            optimise_by=['decoding.time', min]):
    '''
    Makes an optimised DNA sequence corresponding to an input amino
    acid sequence.

    Parameters
    ==========
    seq : str
        The amino acid sequence to be optimised.

    diversify : list of str
        A list specifying individual amino acids for which codons
        should be diversified.

    diversify_range : float
        The proportion by which the optimisation parameter is allowed
        to deviate from its best value for diversified amino acids,
        relative to 1.

    ref_table : str
        The name of the data file containing the codon data.

    enforce : dict
        A dictionary for manually specifying codons for certain amino
        acids (for these the optimisation parameters are overridden).
        Example: enforce = {'K':'AAG'} will always use AAG to encode
        lysine (K). A list of codons may be given instead of a single
        codon, in which case one of them is drawn at random.

    optimise_by : list of str and function
        The str part of optimise_by specifies which column of the
        ref_table should be used for optimisation. Function can be min
        or max and specifies whether the lowest or highest value should
        be selected for optimisation.

    Returns
    =======
    str
        Returns a DNA sequence string.

    Raises
    ======
    ValueError
        If codons are to be diversified and the function in optimise_by
        is neither min nor max.
    KeyError
        If seq contains an amino acid that is neither in the reference
        table nor in enforce.
    '''

    parameterset = load_from_Data(ref_table)
    opt_column = optimise_by[0]
    opt_func = optimise_by[1]

    #define the reverse translation dictionary:
    #which codons should be considered for which amino acid?
    #if an amino acid is not in the diversify list, use the best codon(s)
    #if an amino acid is in the diversify list, diversify codon choice by
    #random draw from all codons for which the diversify_range threshold
    #applies
    reverse_dict = {}
    for aa in parameterset['one.letter'].unique():
        codons_for_aa = parameterset.loc[parameterset['one.letter'] == aa]
        #always rank by the requested column
        opt_values = codons_for_aa[opt_column]
        if aa in diversify:
            #thresholds are inclusive so that the best codon always
            #qualifies, even with diversify_range = 0
            if opt_func is min:
                acceptable = opt_values <= min(opt_values) * (1 + diversify_range)
            elif opt_func is max:
                acceptable = opt_values >= max(opt_values) * (1 - diversify_range)
            else:
                raise ValueError(
                    "optimise_by[1] must be min or max when diversifying codons")
        else:
            acceptable = opt_values == opt_func(opt_values)
        reverse_dict[aa] = list(codons_for_aa.loc[acceptable]['codon'])

    #have codons been specified using the 'enforce' parameter?
    for key, value in enforce.items():
        enforced = list(value) if isinstance(value, (list, tuple)) else [value]
        #compare in the DNA alphabet: the table may list RNA codons while
        #enforced codons are usually given as DNA
        codon_set = {
            str(codon).replace('U', 'T')
            for codon in parameterset.loc[parameterset['one.letter'] ==
                                          key]['codon']
        }
        if any(str(codon).replace('U', 'T') not in codon_set
               for codon in enforced):
            print('Non-standard genetic code enforced!')
        print('Codon for ' + key + ' enforced as ' +
              ', '.join(str(codon) for codon in enforced))
        reverse_dict[key] = enforced

    #draw one acceptable codon per amino acid
    codon_seq = [random.choice(reverse_dict[seq_aa]) for seq_aa in seq]
    return ''.join(codon_seq).replace('U', 'T')
예제 #6
0
def remove_RE(site,
              test_seq,
              ref_table='Scer',
              optimise_by=['decoding.time', min],
              suppress_not_found=False):
    '''Removes restriction enzyme sites from DNA sequences without altering the encoded
    amino acid sequence and while maintaining codon optimisation as much as possible.

    Each occurrence of the site is removed by exchanging one codon that
    overlaps it for a synonymous codon. Of all exchanges that remove the
    occurrence without creating a new one, the one with the best summed
    optimisation value over the affected codons is used.

    Parameters
    ==========
    site : str
        the name of the restriction enzyme for which sites should be removed,
        or the site sequence itself (IUPAC ambiguity symbols are allowed).

    test_seq : str
        the sequence from which sites are to be removed. Codons are read in
        frame from the first nucleotide.

    ref_table : str
        the name of the reference table from which the optimisation information
        is being used.

    optimise_by : list of str and Function
        As for opt_seq, the name of the column of ref_table from which
        optimisation info is generated, and whether optimal is the minimum or
        maximum.

    suppress_not_found : bool
        If True, no message is printed when the site does not occur in
        test_seq.

    Returns
    =======
    str or None
        An upper-case DNA sequence string without the site. None is
        returned if site cannot be interpreted or does not occur in test_seq.

    Raises
    ======
    ValueError
        If an occurrence of the site cannot be removed by exchanging a
        single codon for a synonymous one.
    '''

    from itertools import product

    #load auxiliary package data
    RE_ref = load_from_Data('RE_List')
    codons = load_from_Data(ref_table)

    ambiguous_bases = {
        'W': ['A', 'T'],
        'S': ['G', 'C'],
        'M': ['A', 'C'],
        'K': ['G', 'T'],
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
        'N': ['A', 'C', 'G', 'T']
    }
    #every symbol that can be expanded below is accepted in a site sequence
    valid_symbols = set('ACGT') | set(ambiguous_bases)

    #compare names in upper case without modifying the loaded table
    site = site.upper()
    enzyme_names = RE_ref['Name'].str.upper()

    #is site a restriction enzyme name?
    if site in enzyme_names.values:
        RE_seq = RE_ref.loc[enzyme_names == site]['Motif'].values[0]
    #is site a valid DNA sequence?
    elif site and all(c in valid_symbols for c in site):
        RE_seq = site
    else:
        print(
            'No known Restriction enzyme site or valid DNA sequence specified.'
        )
        return
    #remove leading or trailing Ns from RE site
    RE_seq = RE_seq.strip('N')
    if not RE_seq:
        #a site made up of N only matches everything and cannot be removed
        print(
            'No known Restriction enzyme site or valid DNA sequence specified.'
        )
        return
    #does RE_seq now have more than 5 'N' (this becomes very inefficient)
    if RE_seq.count('N') > 5:
        print(
            'Too many N - the maximum number of N allowed in the RE sequence is 5'
        )
        return

    #list all unambiguous sequences corresponding to the site; symbols
    #without an expansion are kept as they are
    unamb_RE_seqs = {
        ''.join(combo)
        for combo in product(*(ambiguous_bases.get(nt, [nt])
                               for nt in RE_seq))
    }
    site_len = len(RE_seq)

    def site_starts(fragment):
        #start positions of all (possibly overlapping) sites in fragment
        return [
            pos for pos in range(len(fragment) - site_len + 1)
            if fragment[pos:pos + site_len] in unamb_RE_seqs
        ]

    #convert sequence to valid uppercase DNA sequence
    test_seq = test_seq.upper().replace('U', 'T')

    #is site in seq?
    if not site_starts(test_seq):
        if not suppress_not_found:
            print(site + ' not found in this sequence')
        return
    new_seq = test_seq

    #prepare auxiliary data structures (keyed by DNA codons; the loaded
    #table itself is left unmodified)
    dna_codons = list(codons['codon'].str.replace('U', 'T'))
    code_lookup = dict(zip(dna_codons, codons['one.letter']))
    speed_lookup = dict(zip(dna_codons, codons[optimise_by[0]]))
    reverse_code_lookup = {}
    for codon, aa in code_lookup.items():
        reverse_code_lookup.setdefault(aa, []).append(codon)

    #a codon exchange can only affect sites within site_len - 1 nucleotides
    #of the exchanged codons
    flank = site_len - 1

    #go through each RE site and replace one codon to remove it
    while True:
        starts = site_starts(new_seq)
        if not starts:
            break
        #the first instance of the site and the codons that contain it
        found_RE_index = starts[0]
        site_end = found_RE_index + site_len
        subseq_start = found_RE_index - found_RE_index % 3
        #round the end of the site up to the next codon boundary
        subseq_stop = min(site_end + (-site_end) % 3, len(new_seq))
        subseq_codons = [
            new_seq[n:n + 3] for n in range(subseq_start, subseq_stop, 3)
        ]

        left_flank = new_seq[max(0, subseq_start - flank):subseq_start]
        right_flank = new_seq[subseq_stop:subseq_stop + flank]
        sites_before = len(
            site_starts(left_flank + ''.join(subseq_codons) + right_flank))

        seq_vec, times_vec = [], []
        for idx, sub_codon in enumerate(subseq_codons):
            #codons without an entry in the table (eg an incomplete trailing
            #codon) have no alternatives and are left as they are
            this_aa = code_lookup.get(sub_codon)
            for alternative in reverse_code_lookup.get(this_aa, []):
                if alternative == sub_codon:
                    continue
                new_subseq_codons = subseq_codons[:idx] + [
                    alternative
                ] + subseq_codons[idx + 1:]
                new_subseq = ''.join(new_subseq_codons)
                #only accept exchanges that lower the number of sites,
                #counting sites that span the junction with the flanks; this
                #guarantees that the loop terminates
                sites_after = len(
                    site_starts(left_flank + new_subseq + right_flank))
                if sites_after < sites_before:
                    seq_vec.append(new_subseq)
                    #codons without a value contribute 0; they are the same
                    #in every option and do not affect the ranking
                    times_vec.append(
                        sum(
                            speed_lookup.get(codon, 0)
                            for codon in new_subseq_codons))
        if not times_vec:
            raise ValueError(
                'The site at position ' + str(found_RE_index) +
                ' cannot be removed by a single synonymous codon change.')
        best_option_index = times_vec.index(optimise_by[1](times_vec))
        new_seq = (new_seq[:subseq_start] + seq_vec[best_option_index] +
                   new_seq[subseq_stop:])

    return new_seq