Beispiel #1
0
    def find_hairpins_windowed(self,
                               Hwindow=[50, 100, 50],
                               HL=[11, 17, 21],
                               HGC=0.8,
                               HMFE=[-10, -15, -20],
                               pal=11):

        self.h_count = 0
        self.h_loc_list = []
        self.h_info = {}
        self.hairpins = {}

        self.DNAmodel = PyVRNA(parameter_file="dna_mathews2004.par",
                               dangles=0,
                               noGU=True)  # initialize PyVRNA model object

        for j in range(self.size - 100):
            seq1 = self.sequence1[j:j + 100]
            seq2 = self.sequence2[self.size - (j + 100):self.size - j]
            for i, h in enumerate(Hwindow):
                lengths1, details1 = self.get_hairpin_lengths(seq1, h)
                lengths2, details2 = self.get_hairpin_lengths(seq2, h)
                self.hairpin_processing(j, lengths1, details1, HL[i], HGC,
                                        HMFE[i])
                self.hairpin_processing(j,
                                        lengths2,
                                        details2,
                                        HL[i],
                                        HGC,
                                        HMFE[i],
                                        rc=True)

        self.energies, self.max_bp_hairpin = self.get_windowed_DNA_structure_info(
            50)
        self.fast_hairpins(fast=False)
# sys.path.append('/usr/local/lib/python2.7/site-packages/')

import synbiomts
import cPickle as pickle
# import TranslationRateModels as tl

# We're going to use shelve to store model predictions
# as a dictionary-like persistance object of pandas dataframes
import shelve

# Import private Salis Lab code (latest versions of RBS Calculator)
sys.path.append('/home/alex/Private-Code')
from DNAc import *

from PyVRNA import PyVRNA
RNAEnergyModel = PyVRNA(dangles=0)

handle = open('models/rRNA_16S_3p_ends.p', 'r')
rRNA_16S_3p_ends = pickle.load(handle)
handle.close()


# Required function in RBS Calculators and UTR Designer
def get_rRNA(organism):
    # temporary until I update database:
    if organism == "Corynebacterium glutamicum B-2784":
        organism = 'Corynebacterium glutamicum R'

    elif organism == "Pseudomonas fluorescens A506":

        return 'ACCTCCTTT'
Beispiel #3
0
class seq_assess(object):
    def __init__(self, sequence=None, begin=0, verbose=None):
        # defaults
        self.rc = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
        self.bad_seqs = []
        self.score = [0, 0, 0, 0]
        self.comp_table = maketrans('ATGC', 'TACG')
        self.verbose = verbose
        self.begin = begin
        if sequence:
            self.load(sequence)

    def load(self, sequence):
        #input validation
        try:
            sequence1 = str(sequence.seq).upper()
            sequence2 = str(sequence.seq.reverse_complement()).upper()
        except:
            try:
                sequence1 = str(sequence).upper()
                sequence2 = str(sequence.reverse_complement()).upper()
            except:
                try:
                    sequence1 = str(sequence)
                    sequence1 = sequence1.upper()
                    #print sequence1
                    bases = list(sequence1)
                    bases = reversed(
                        [self.rc.get(base, base) for base in bases])
                    sequence2 = ''.join(bases)
                    #print sequence2
                except:
                    print "The sequence provided was neither a Biopython record, Biopython Seq class, nor a valid string with your DNA sequence. Check what you are passing to seq_assess."
                    self.bad_seqs.append(
                        "The sequence provided was neither a Biopython record, Biopython Seq class, nor a valid string with your DNA sequence. Check what you are passing to seq_assess."
                    )
        # setting up internal sequence
        self.size = len(sequence)
        self.sequence = sequence1
        self.sequence1 = sequence1
        self.sequence2 = sequence2

    def run(self):
        ### quickmode for running as an evaluate method

        self.find_repeats(RL=9, Rmax=11)
        self.find_hairpins_windowed()
        self.find_nucleotides()
        if self.verbose:
            for i in self.bad_seqs:
                print i

        self.s_count = self.r_count + self.h_count + self.n_count
        self.s_loc_list = self.r_loc_list + self.h_loc_list + self.n_loc_list

        self.s_info = {}
        for info in [self.r_info, self.h_info, self.n_info]:
            for key in info.keys():

                try:
                    x = len(
                        info[key]["locations"][0]
                    )  #this should be a tuple and have length, but sometimes it's not!
                except:
                    print info[key]

                try:
                    for location in info[key]["locations"]:
                        if location[0] > location[1]:
                            print "location is ", location
                            print "The begin position must be less than the end position."
                            print info[key]
                except:
                    print "Error"
                    print info[key]

                if key not in self.s_info:
                    self.s_info[key] = info[key]
                else:
                    self.s_info[key]["locations"] += info[key]["locations"]
                    self.s_info[key]["types"] += info[key]["types"]

    def run_ELSA_analysis(self):
        ## quickmode for running as an evaluate method

        self.find_repeats(RL=9, Rmax=11)
        self.fast_hairpins()
        self.find_nucleotides()

        self.s_count = self.r_count + self.h_count + self.n_count
        self.s_loc_list = self.r_loc_list + self.h_loc_list + self.n_loc_list

        self.s_info = {}
        for info in [self.r_info, self.h_info, self.n_info]:
            for key in info.keys():

                try:
                    x = len(
                        info[key]["locations"][0]
                    )  #this should be a tuple and have length, but sometimes it's not!
                except:
                    print info[key]

                try:
                    for location in info[key]["locations"]:
                        if location[0] > location[1]:
                            print "location is ", location
                            print "The begin position must be less than the end position."
                            print info[key]
                except:
                    print "Error"
                    print info[key]

                if key not in self.s_info:
                    self.s_info[key] = info[key]
                else:
                    self.s_info[key]["locations"] += info[key]["locations"]
                    self.s_info[key]["types"] += info[key]["types"]

    def find_repeats(self,
                     RL=8,
                     Rmax=20,
                     Rthreshold=[0.69, 0.40, 0.90, 0.60],
                     Rwindow=[70, 500, 60],
                     Rtandem=5):
        ### code for testing the following six rules:
        # A) Any sequence that contains more that 69 % of  bases that are part of repeats 8 bases or longer *   "X= 69 Y=8" 10
        # B) Any single repeated sequence contains more that 40 % of  bases that are part of repeats 8 bases or longer *    "X=40 Y=8"  6
        # C) Any sequence that contains more that 90 % of  bases that are part of repeats 8 bases or longer within a window of 70 length *  "X=90 Y=8 Z=70" 10
        # D) Any sequence that contains more that 60 % of  bases that are part of repeats 8 bases or longer within a window of 500 length for sequences longer than 1000 *  "X=60 Y=8 Z=500 L=1000" 10
        # E) a tandem repeat of length 5 or greater beginning or ending at either termini of the sequence   X=5 10
        # F) repeats more than 12 bp long risk impede synthesis

        ## outputs for Operon Calculator
        self.r_count = 0
        self.r_loc_list = []
        self.r_info = {}

        ## FastFinder, courtesy of Ayaan Hossain
        RP = FastFinder()

        self.RepeatDict = RP.get_repeat_dict([self.sequence1],
                                             RL,
                                             verbose=False)

        total = np.zeros(self.size, dtype=bool)
        total2 = np.zeros(self.size)
        r_distribution = {
        }  # for gathering distributions of repeats for visualizations

        location = {}

        for i in self.RepeatDict.keys():
            self.r_info[i] = {"locations": [], "types": []}
            val = np.zeros(self.size, dtype=bool)

            if len(i) > Rmax:  # F)
                self.bad_seqs.append("This repeat is too long: %s" % (i))

            for j in self.RepeatDict[i].keys():
                location_list = list(self.RepeatDict[i][j][0])
                self.r_info[i]["types"].extend([j])
                self.r_info[i]["locations"].extend([(self.begin + k,
                                                     self.begin + k + len(i))
                                                    for k in location_list])

                for k in location_list:
                    if len(i) > Rmax:
                        self.score[0] += len(i)

                    self.r_count += 1
                    self.r_loc_list += [((self.begin + k,
                                          self.begin + k + len(i)), 1.0)]

                    for v in range(k, k + len(i)):
                        val[v] = True

                    if len(i) not in r_distribution:
                        #for l in range(RL,len(i)):
                        #    if l not in r_distribution:
                        #        r_distribution[l] = 0
                        r_distribution[len(i)] = 1
                    else:
                        r_distribution[len(i)] += 1

            ind_fract = float(sum(val.astype(int))) / self.size
            if ind_fract > Rthreshold[1]:  # B)
                self.bad_seqs.append(
                    "Too much of this repeat: %s. Redesign to remove " % (i))
                self.score[0] += 6

            total += val
            total2 += val.astype(int)

        total_fract = float(sum(total.astype(int))) / self.size
        if total_fract > Rthreshold[0]:  # A)
            self.bad_seqs.append("Too many repeats. Redesign to remove ")
            self.score[0] += 10

        windowed_repeats = []  # for more visuals
        for i in range(len(Rwindow) - 1):  # C) & D)

            for j in range(self.size - Rwindow[i]):
                peek = float(sum(total[j:j + Rwindow[i]].astype(int)))

                if peek / Rwindow[i] > Rthreshold[2 + i]:
                    #self.bad_seqs.append("%i bp window starting at %i is too repeat rich. Redesign to remove "%(Rwindow[i],j))
                    self.score[0] += 10
                if i == 0:
                    windowed_repeats.append(peek / Rwindow[i])
        # E)
        term1 = self.sequence1[:Rwindow[2]]
        self.tandem_subroutine(term1, Rtandem, "five")

        term2 = self.sequence1[len(self.sequence1) - Rwindow[2]:]
        self.tandem_subroutine(term2, Rtandem, "three")

        self.repeat_density = windowed_repeats
        rep_list = []
        for k, v in r_distribution.items():
            rep_list.append((k, v))
        self.repeat_distribution = rep_list

    def tandem_subroutine(self, term, Rtandem, end):
        # subroutine that finds tandem repeats using regex
        i = int(Rtandem)
        L = int(Rtandem)

        while i < len(term):
            seq = term[(i - L):i]
            run = re.finditer(r"%s%s" % (seq, seq), term)
            m = [[n.start(), n.end()] for n in run]

            if len(m) > 0:
                tandrep = ''.join([seq, seq])
                self.r_count += 1
                self.r_info[tandrep] = {"locations": [], "types": []}

                if end == "five":
                    self.r_info[tandrep]["locations"].append(
                        (self.begin + i, self.begin + i + len(tandrep))
                    )  #Adding offset (begin) if sequence is a subsequence
                    self.r_info[tandrep]["types"].append("tandem")
                    self.r_loc_list += [
                        ((self.begin + i, self.begin + i + len(tandrep)), 1.0)
                    ]  #Adding offset (begin) if sequence is a subsequence
                    self.bad_seqs.append(
                        "tandem repeat too close to 5' end: %s" % (tandrep))

                elif end == "three":
                    self.r_info[tandrep]["locations"].append(
                        (self.begin + self.size - 60 + i - L,
                         self.begin + self.size - 60 + i - L + len(tandrep))
                    )  #Adding offset (begin) if sequence is a subsequence
                    self.r_info[tandrep]["types"].append("tandem")
                    self.r_loc_list += [
                        ((self.begin + self.size - 60 + i - L,
                          self.begin + self.size - 60 + i - L + len(tandrep)),
                         1.0)
                    ]  #Adding offset (begin) if sequence is a subsequence
                    self.bad_seqs.append(
                        "tandem repeat too close to 3' end: %s" % (tandrep))

                else:
                    raise InputError("internal error in tandem repeat finding")

                self.score[0] += 10

            i += 1

    def hairpin_processing(self,
                           index,
                           lengths,
                           details,
                           l_threshold,
                           gc_threshold,
                           mfe,
                           rc=False):
        for i, l in enumerate(lengths):
            if l > l_threshold and details[i][3] < mfe and l_threshold < 16:
                hpinx = details[i][0]
                structure = details[i][1]
                gc = self.GC_count(hpinx, len(hpinx))
                if rc:
                    location = self.size - (index + details[i][2] + len(hpinx))
                else:
                    location = index + details[i][2]
                location_tup = (self.begin + location,
                                self.begin + location + len(hpinx))
                #print location_tup
                if gc > gc_threshold:
                    if hpinx not in self.h_info.keys():
                        self.h_info[hpinx] = {
                            "locations":
                            [(self.begin + location,
                              self.begin + location + len(hpinx))],
                            "types": ["strong hairpin"]
                        }  #Adding offset (begin) if sequence is a subsequence
                        self.h_loc_list += [
                            ((self.begin + location,
                              self.begin + location + len(hpinx)), 1.0)
                        ]  #Adding offset (begin) if sequence is a subsequence
                        self.h_count += 1
                        self.hairpins[hpinx] = (self.begin + location,
                                                structure)
                        self.score[1] += 10
                        #print "too long hairpin at %i: %s, %s"%(location,hpinx,structure)
                        self.bad_seqs.append("strong hairpin at %i: %s, %s" %
                                             (location, hpinx, structure))
                    else:
                        continue
            elif l > l_threshold and details[i][
                    3] < mfe and l_threshold > 16 and l_threshold < 20:
                hpinx = details[i][0]
                structure = details[i][1]
                gc = self.GC_count(hpinx, len(hpinx))
                if rc:
                    location = self.size - (index + details[i][2] + len(hpinx))
                else:
                    location = index + details[i][2]
                location_tup = (self.begin + location,
                                self.begin + location + len(hpinx))
                if hpinx not in self.h_info.keys():
                    self.h_info[hpinx] = {
                        "locations": [(self.begin + location,
                                       self.begin + location + len(hpinx))],
                        "types": ["long hairpin"]
                    }  #Adding offset (begin) if sequence is a subsequence
                    self.h_loc_list += [
                        ((self.begin + location,
                          self.begin + location + len(hpinx)), 1.0)
                    ]  #Adding offset (begin) if sequence is a subsequence
                    self.h_count += 1
                    self.hairpins[hpinx] = (self.begin + location, structure)
                    self.score[1] += 10
                    #print "too long hairpin at %i: %s, %s"%(location,hpinx,structure)
                    self.bad_seqs.append("long hairpin at %i: %s, %s" %
                                         (location, hpinx, structure))
                else:
                    continue
            elif l > l_threshold and details[i][3] < mfe and l_threshold > 20:
                hpinx = details[i][0]
                structure = details[i][1]
                gc = self.GC_count(hpinx, len(hpinx))
                if rc:
                    location = self.size - (index + details[i][2] + len(hpinx))
                else:
                    location = index + details[i][2]
                location_tup = (self.begin + location,
                                self.begin + location + len(hpinx))
                if hpinx not in self.h_info.keys():
                    self.h_info[hpinx] = {
                        "locations": [(self.begin + location,
                                       self.begin + location + len(hpinx))],
                        "types": ["long hairpin"]
                    }  #Adding offset (begin) if sequence is a subsequence
                    self.h_loc_list += [
                        ((self.begin + location,
                          self.begin + location + len(hpinx)), 1.0)
                    ]  #Adding offset (begin) if sequence is a subsequence
                    self.h_count += 1
                    self.hairpins[hpinx] = (self.begin + location, structure)
                    self.score[1] += 10
                    #print "too long hairpin at %i: %s, %s"%(location,hpinx,structure)
                    self.bad_seqs.append("long hairpin at %i: %s, %s" %
                                         (location, hpinx, structure))
            else:
                continue
        return

    def find_hairpins_windowed(self,
                               Hwindow=[50, 100, 50],
                               HL=[11, 17, 21],
                               HGC=0.8,
                               HMFE=[-10, -15, -20],
                               pal=11):

        self.h_count = 0
        self.h_loc_list = []
        self.h_info = {}
        self.hairpins = {}

        self.DNAmodel = PyVRNA(parameter_file="dna_mathews2004.par",
                               dangles=0,
                               noGU=True)  # initialize PyVRNA model object

        for j in range(self.size - 100):
            seq1 = self.sequence1[j:j + 100]
            seq2 = self.sequence2[self.size - (j + 100):self.size - j]
            for i, h in enumerate(Hwindow):
                lengths1, details1 = self.get_hairpin_lengths(seq1, h)
                lengths2, details2 = self.get_hairpin_lengths(seq2, h)
                self.hairpin_processing(j, lengths1, details1, HL[i], HGC,
                                        HMFE[i])
                self.hairpin_processing(j,
                                        lengths2,
                                        details2,
                                        HL[i],
                                        HGC,
                                        HMFE[i],
                                        rc=True)

        self.energies, self.max_bp_hairpin = self.get_windowed_DNA_structure_info(
            50)
        self.fast_hairpins(fast=False)

    def get_hairpin_lengths(self, seq, maxspan):
        # hairpin length subroutine
        # details = list of tuples (subsequence,substructure,start_pos of str)
        self.DNAmodel.settings.max_bp_span = maxspan
        fold = self.DNAmodel.RNAfold(seq)
        parsed_fold = self.DNAmodel.vienna2bp(fold.structure)

        bp_x = parsed_fold.bpx
        bp_y = parsed_fold.bpy
        lengths = []
        details = []

        while len(bp_x) > 0:
            indx = bisect(bp_x, bp_y[0])
            lengths.append(indx)  # save number of bp in each hairpin (height)
            details.append(
                (seq[bp_x[0] - 1:bp_y[0]], fold.structure[bp_x[0] - 1:bp_y[0]],
                 bp_x[0] - 1,
                 fold.energy))  # save string and location of each hairpin
            bp_x = bp_x[indx:]
            bp_y = bp_y[indx:]

        return lengths, details

    def _is_hairpin_pass(self, seq, stem, loop, max_mismatch, gc_high):
        #quickmode Haripin subroutine
        i = 0
        while i < len(seq) - (stem + loop + stem) + 1:
            j, mismatch, gc_count = 0, 0, 0.0

            while j < stem:

                if seq[i + j] in ['G', 'C']:
                    gc_count += 1.0

                if seq[i + (stem + loop + stem) - j - 1] in ['G', 'C']:
                    gc_count += 1.0

                if seq[i + j] != seq[i + (stem + loop + stem) - j -
                                     1].translate(self.comp_table):
                    mismatch += 1

                if mismatch > max_mismatch:
                    break

                j += 1

            if j == stem:
                gc_content = (gc_count / 2.0) / stem

                if gc_content >= gc_high:
                    hairpin = seq[i:i + (stem + loop + stem)]

                    if hairpin not in self.h_info.keys():

                        if gc_high > 0.50:
                            self.h_info[hairpin] = {
                                "locations":
                                [(self.begin + i, self.begin + i + (stem))],
                                "types": ["strong hairpin"]
                            }

                        else:
                            self.h_info[hairpin] = {
                                "locations":
                                [(self.begin + i, self.begin + i + (stem))],
                                "types": ["long hairpin"]
                            }

                    else:
                        self.h_info[hairpin]["locations"].append(
                            (self.begin + i, self.begin + i + (stem)))

                        if gc_high > 0.50:
                            self.h_info[hairpin]["types"].append(
                                "strong hairpin")

                        else:
                            self.h_info[hairpin]["types"].append(
                                "long hairpin")

                    self.h_loc_list += [((self.begin + i,
                                          self.begin + i + (stem)), 1.0)]
                    self.h_count += 1
                    self.score[1] += 10
                    self.bad_seqs.append(
                        "too long hairpin at position %i-%i:%s " %
                        (i, i +
                         (stem + loop + stem), seq[i:i +
                                                   (stem + loop + stem)]))

            i += 1

    def fast_hairpins(self, fast=True):
        if fast:
            # hairpin quickmode
            self.h_count = 0
            self.h_loc_list = []
            self.h_info = {}

        # Type 1: Stem = 11bp, Loop = 3bp to 48bp, Mismatches = 2, Stem must have GC-content >= 0.80
        seq = self.sequence1
        stem, max_mismatch, gc_high = 11, 2, 0.80

        for loop in xrange(3, 48 + 1):

            if len(seq) >= (stem + loop + stem):
                self._is_hairpin_pass(seq, stem, loop, max_mismatch, gc_high)

            else:
                break

        # Type 2: Stem = 17bp, Loop = 3bp to 100bp, Mismatches = 3
        stem, max_mismatch, gc_high = 17, 3, 0.0

        for loop in xrange(3, 100 + 1):

            if len(seq) >= (stem + loop + stem):
                self._is_hairpin_pass(seq, stem, loop, max_mismatch, gc_high)

            else:
                break

        if len(seq) > 500:
            # Type 3: Stem = 22bp, Loop = 100bp to 500bp, Mismatches = 5
            stem, max_mismatch, gc_high = 22, 5, 0.0

            for loop in xrange(100, 500 + 1):

                if len(seq) >= (stem + loop + stem):
                    self._is_hairpin_pass(seq, stem, loop, max_mismatch,
                                          gc_high)

                else:
                    break

        #Palindromes
        stem, loop, max_mismatch, gc_high = 11, 0, 1, 0.0

        if len(seq) >= (stem + loop + stem):
            self._is_hairpin_pass(seq, stem, loop, max_mismatch, gc_high)

    def get_windowed_DNA_structure_info(self, window_size):
        # hairpin visualization subroutine
        energies = []
        max_bp_hairpin = []

        for i in range(self.size - window_size):
            seq = self.sequence1[i:i + window_size]
            fold = self.DNAmodel.RNAfold(seq)
            mfe = fold.energy
            parsed_fold = self.DNAmodel.vienna2bp(fold.structure)
            bp_x = parsed_fold.bpx
            bp_y = parsed_fold.bpy
            lengths = []
            details = []

            while len(bp_x) > 0:
                indx = bisect(bp_x, bp_y[0])
                lengths.append(
                    indx)  # save number of bp in each hairpin (height)
                details.append(
                    (seq[bp_x[0] - 1:bp_y[0]],
                     fold.structure[bp_x[0] - 1:bp_y[0]],
                     bp_x[0] - 1))  # save string and location of each hairpin
                bp_x = bp_x[indx:]
                bp_y = bp_y[indx:]

            lengths = sorted(lengths)
            #print lengths

            if lengths:
                max_bp_hairpin.append(lengths[-1])

            else:
                max_bp_hairpin.append(0)

            energies.append(mfe)

        return energies, max_bp_hairpin

    def find_nucleotides(self,
                         GCwindow=[20, 100],
                         GC_max=0.67,
                         GC_low=[0.20, 0.30],
                         GC_high=[0.80, 0.70],
                         dGC=0.50,
                         T_window=50,
                         T_bounds=[0.30, 0.70],
                         cons={
                             'A': 13,
                             'C': 9,
                             'G': 9,
                             'T': 13
                         },
                         run_size=[3, 2],
                         run_count=[6, 10],
                         tm_bounds=[40, 70],
                         g_quad_motif="G{3,5}[ATGC]{2,7}",
                         i_motif="C{2,5}[ATGC]{2,7}"):
        ### Finds sequecnes that violate the following rules:
        # Greater than or equal to X consecutive bases within the sequence   "A=13 C=9 G=9 T=13" 10
        # Overall GC% is equal to or greater than X  x=67    10
        # GC% in any window of X  is less than or equal to  Y %  "X=20, Y=20; X=100, Y=30"   "3; 10"
        # GC% in any window of X bp is equal to or greater than Y %  "X=20, Y=80; X=100, Y=70"   "3;10"
        # GC content of terminal 50 bp is X % GC or greater  X=70    3
        # GC content of terminal 50 bp is X % GC or less X=30    3
        # No more than X consecutive  copies of a  Y nucleotide repeat are present   "X=6, Y=3; X=10, Y=2"   10
        # change in GC content no greater than 50% in 100 bp windows

        self.n_count = 0
        self.n_count_2 = 0
        self.n_loc_list = []
        self.n_info = {}

        self.dGC_list = []
        self.dGC_highlights = []

        self.dTm_list = []
        self.dTm_highlights = []

        GCcount = self.GC_count(self.sequence1, self.size)
        #print GCcount
        #GC_count=sum([1 for i in self.sequence1 if (i=='G' or i =='C')])/self.size

        if GCcount >= GC_max:
            #print "GC content of entire sequence too high"
            self.bad_seqs.append("GC content of entire sequence too high")
            self.score[2] += 10

        for key in cons:
            rule = "%s{%i,1000}" % (key, cons[key])
            self.reg_ex(rule, self.sequence1, "poly N run")

        for j in range(len(GCwindow)):

            if j == 0:
                self.tm_list = []
                self.GC_list = []

            for i in range(self.size - GCwindow[j]):
                seq = self.sequence1[i:i + GCwindow[j]]
                val = self.GC_count(seq, GCwindow[j])

                if j == 0:
                    tm = tm_calc(seq)
                    self.tm_list.append(tm)
                    self.GC_list.append(val)

                    if val <= GC_low[j]:
                        self.GC_report(i, GCwindow[j], seq, True, False)
                        self.score[2] += 3

                    if val >= GC_high[j]:
                        self.GC_report(i, GCwindow[j], seq, False, False)
                        self.score[2] += 3

                    if tm <= tm_bounds[0]:
                        self.GC_report(i, GCwindow[j], seq, True, True)
                        self.score[3] += 3

                    if tm >= tm_bounds[1]:
                        self.GC_report(i, GCwindow[j], seq, False, True)
                        self.score[3] += 3
                if val <= GC_low[j]:
                    self.score[2] += 10

                if val >= GC_high[j]:
                    self.score[2] += 10

        self.delta_GC_fxn(Tm_mode=False)
        self.delta_GC_fxn(Tm_mode=True)

        for i in range(T_window - 20):
            fiveterm = self.sequence1[i:i + 20]
            fiveGC = self.GC_count(fiveterm, 20)

            threeterm = self.sequence1[self.size - T_window + i:self.size -
                                       T_window + i + 20]
            threeGC = self.GC_count(threeterm, 20)

            if fiveGC < T_bounds[0]:
                seq = self.sequence1[i:i + 20]
                self.GC_report(i, 20, fiveterm, True, False, True)
                self.score[2] += 10

            if fiveGC > T_bounds[1]:
                seq = self.sequence1[i:i + 20]
                self.GC_report(i, 20, fiveterm, False, False, True)
                self.score[2] += 10

            if threeGC < T_bounds[0]:
                seq = self.sequence1[self.size - T_window + i:self.size -
                                     T_window + i + 20]
                self.GC_report(self.size - T_window + i, 20, threeterm, True,
                               False, True)
                self.score[2] += 10

            if threeGC > T_bounds[1]:
                seq = self.sequence1[self.size - T_window + i:self.size -
                                     T_window + i + 20]
                self.GC_report(self.size - T_window + i, 20, threeterm, False,
                               False, True)
                self.score[2] += 10

        for i in range(len(run_size)):
            merlist = mers(run_size[i])

            for j in merlist:
                rule = j * run_count[i]
                self.reg_ex(rule, self.sequence1, "mer_run")
        self.reg_ex(g_quad_motif * 3, self.sequence1, "g_quadruplex")
        self.reg_ex(i_motif * 3, self.sequence1, "i_motifs")

    def GC_report(self, i, window, seq, low=None, Tm=None, terminal=None):
        if low:
            nucs = "AT"
            report1 = "low"
        else:
            nucs = "GC"
            report1 = "high"
        if Tm:
            report2 = "Tm"
        else:
            report2 = "GC"
        if terminal:
            term = "terminal "
        else:
            term = ""
        #print len(seq)
        #print type(window)
        #print
        sub_seqs = [i + k for k in range(window) if seq[k] in nucs]
        if len(sub_seqs) == 0:
            print seq, window, nucs
        new_seqs = []
        five_p = sub_seqs[0]
        for n, s in enumerate(sub_seqs[1:]):
            #print s
            #print sub_seqs[n]+1
            if s == sub_seqs[n] + 1 and s != sub_seqs[-1]:
                continue
            elif s == sub_seqs[n] + 1 and s == sub_seqs[-1]:
                new_seqs.append(
                    (self.begin + five_p, self.begin + sub_seqs[-1]))
            else:
                new_seqs.append(
                    (self.begin + five_p, self.begin + sub_seqs[n]))
                five_p = s
                if s == sub_seqs[-1]:
                    new_seqs.append((self.begin + s, self.begin + s))
        #print sub_seqs
        #print new_seqs
        for k in new_seqs:
            if seq not in self.n_info.keys():
                self.n_info[seq] = {
                    "locations": [k],
                    "types": ["{}{} {}".format(term, report1, report2)]
                }

            else:
                self.n_info[seq]["locations"].append(k)
                self.n_info[seq]["types"].append("{}{} {}".format(
                    term, report1, report2))

            self.n_loc_list += [(k, 1.0)]
            self.n_count += 1
        self.n_count_2 += 1
        self.bad_seqs.append(
            "{} {} in a {} bp {}window at position {}: {}".format(
                report1, report2, window, term, i, seq))

    def delta_GC_fxn(self, Tm_mode=None, thresholds=[0.5, 30]):
        if Tm_mode:
            strtype = "Tm"
            feed_list = self.tm_list
            d_list = self.dTm_list
            highlights = self.dTm_highlights
            th = thresholds[1]
        else:
            strtype = "GC"
            feed_list = self.GC_list
            d_list = self.dGC_list
            highlights = self.dGC_highlights
            th = thresholds[0]
        last_seq = [None, None]
        for i in range(len(feed_list) - 80):
            local = feed_list[i:i + 80]
            hi = max(local)
            lo = min(local)
            d_list.append(hi - lo)
            if hi - lo > th:
                hi_seq = self.sequence[i + local.index(hi):i +
                                       local.index(hi) + 20]
                lo_seq = self.sequence[i + local.index(lo):i +
                                       local.index(lo) + 20]
                if hi_seq != last_seq[1] or lo_seq != last_seq[0]:
                    if i + local.index(hi) not in highlights:
                        highlights.append(i + local.index(hi))
                    if i + local.index(lo) not in highlights:
                        highlights.append(i + local.index(lo))
                    last_seq = [lo_seq, hi_seq]
                    seq = self.sequence1[i:i + 100]
                    self.score[2] += 10
                    sub_seqs_l = [
                        i + local.index(lo) + k for k in range(20)
                        if last_seq[0][k] in "AT"
                    ]
                    sub_seqs_h = [
                        i + local.index(hi) + k for k in range(20)
                        if last_seq[1][k] in "GC"
                    ]
                    new_seqs = []
                    five_p = sub_seqs_l[0]
                    for n, s in enumerate(sub_seqs_l[1:]):
                        #print s
                        #print sub_seqs[n]+1
                        if s == sub_seqs_l[n] + 1 and s != sub_seqs_l[-1]:
                            continue
                        elif s == sub_seqs_l[n] + 1 and s == sub_seqs_l[-1]:
                            new_seqs.append((self.begin + five_p,
                                             self.begin + sub_seqs_l[-1]))
                        else:
                            new_seqs.append((self.begin + five_p,
                                             self.begin + sub_seqs_l[n]))
                            five_p = s
                            if s == sub_seqs_l[-1]:
                                new_seqs.append(
                                    (self.begin + s, self.begin + s))
                    five_p = sub_seqs_h[0]
                    for n, s in enumerate(sub_seqs_h[1:]):
                        #print s
                        #print sub_seqs[n]+1
                        if s == sub_seqs_h[n] + 1 and s != sub_seqs_h[-1]:
                            continue
                        elif s == sub_seqs_h[n] + 1 and s == sub_seqs_h[-1]:
                            new_seqs.append((self.begin + five_p,
                                             self.begin + sub_seqs_h[-1]))
                        else:
                            new_seqs.append((self.begin + five_p,
                                             self.begin + sub_seqs_h[n]))
                            five_p = s
                            if s == sub_seqs_h[-1]:
                                new_seqs.append(
                                    (self.begin + s, self.begin + s))
                    #print sub_seqs
                    #print new_seqs
                    for k in new_seqs:
                        if seq not in self.n_info.keys():
                            self.n_info[seq] = {
                                "locations": [k],
                                "types": ["delta {}".format(strtype)]
                            }

                        else:
                            self.n_info[seq]["locations"].append(k)
                            self.n_info[seq]["types"].append(
                                "delta {}".format(strtype))

                        self.n_loc_list += [(k, 1.0)]
                        self.n_count += 1
                    self.n_count_2 += 1
                    self.bad_seqs.append(
                        "delta {} in a 100 bp window at position {}: {}, {}".
                        format(strtype, i, lo_seq, hi_seq))
        if Tm_mode:
            self.dTm_list = d_list
            self.dTm_highlights = highlights
        else:
            self.dGC_list = d_list
            self.dGC_highlights = highlights

    def reg_ex(self, rule, seq, ruletype):

        run = re.finditer(r"%s" % (rule), seq)
        m = [[n.start(), n.end()] for n in run]

        if m > 0:

            for r in m:
                run = self.sequence1[r[0]:r[1]]
                if run not in self.n_info.keys():
                    self.n_info[run] = {
                        "locations": [(self.begin + r[0], self.begin + r[1])],
                        "types": [ruletype]
                    }

                else:
                    self.n_info[run]["locations"].append(
                        (self.begin + r[0], self.begin + r[1]))
                    self.n_info[run]["types"].append(ruletype)

                self.n_loc_list += [((self.begin + r[0], self.begin + r[1]),
                                     1.0)]
                self.n_count += 1
                self.n_count_2 += 1
                # print "%s run at position %i"%(motif,r[0])
                self.bad_seqs.append("%s at position %i" % (ruletype, r[0]))
                self.score[2] += 10

    def GC_count(self, seq, window):
        counts = dict(collections.Counter(seq))

        if 'G' in counts.keys() and 'C' in counts.keys():
            val = float(counts['G'] + counts['C']) / window

        elif 'G' in counts.keys():
            val = float(counts['G']) / window

        elif 'C' in counts.keys():
            val = float(counts['C']) / window

        else:
            val = 0

        return val
Beispiel #4
0
import cPickle as pickle

# We're going to use shelve to store model predictions
# as a dictionary-like persistance object of pandas dataframes
import shelve

# Import models
import sys
sys.path.append('../models')

# Import private Salis Lab code (latest versions of RBS Calculator)
sys.path.append('/home/alex/Private-Code')
from DNAc import *

from PyVRNA import PyVRNA
RNAEnergyModel = PyVRNA(dangles=0)

handle = open('../models/rRNA_16S_3p_ends.p','r')
rRNA_16S_3p_ends = pickle.load(handle)
handle.close()

# Required function in RBS Calculators
def get_rRNA(organism):
    # temporary until I update database:
    if organism=="Corynebacterium glutamicum B-2784": organism = 'Corynebacterium glutamicum R'
    elif organism=="Pseudomonas fluorescens A506":    return 'ACCTCCTTT'
    elif organism=="Escherichia coli BL21(DE3)":      return 'ACCTCCTTA'
    else: pass
    return rRNA_16S_3p_ends[organism]

def find_best_start(mRNA, start_pos, predictions):