Example #1
 def __init__(self, abort_event, *args, **kwargs):
     MixtureFactory.__init__(self, abort_event, *args, **kwargs)
     MultiprocessingBase.__init__(self, abort_event)
     self._searcher  = SearchEngine(self._abort_event)
     self._p_weights = [p.num_components for p in self._primers]
     self._pw_sum    = sum(self._p_weights)
     self._num_p     = len(self._primers)
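
Examples #1 and #3 share the same initialization pattern: a class inheriting from several bases with different constructor signatures calls each base's __init__ explicitly, threading the shared abort_event through. A minimal sketch of that pattern (WorkerBase, ConfigBase and Pipeline are hypothetical stand-ins, not names from the examples):

from multiprocessing import Event

class WorkerBase:
    def __init__(self, abort_event):
        #shared event used to signal all workers to stop
        self._abort_event = abort_event

class ConfigBase:
    def __init__(self, config):
        self._config = config

class Pipeline(WorkerBase, ConfigBase):
    def __init__(self, abort_event, config):
        #explicit base-class calls instead of super(): the bases take
        #different arguments, so a cooperative super() chain would not work
        WorkerBase.__init__(self, abort_event)
        ConfigBase.__init__(self, config)

pipeline = Pipeline(Event(), {'num_cycles': 30})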
Example #3
 def __init__(self, abort_event, reactions, concentrations, precision = 1e-10):
     MultiprocessingBase.__init__(self, abort_event)
     EquilibriumBase.__init__(self, reactions, concentrations, precision)
     #input parameters
     self.reactions         = reactions
     self.concentrations    = concentrations
     #group reactions by their connected graph components 
     self._reactions_groups = self._group_reactions()
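
The _group_reactions method called above is not shown in the example. As a rough sketch, grouping reactions into connected components could look like the following; the function name and data layout are assumptions, not the project's actual API, and reactions are assumed to be connected when they share at least one species:

def group_reactions(reactions):
    '''Group reactions into connected components; two reactions are 
    connected when they share at least one species.
    `reactions` maps a reaction id to the set of species ids it involves.'''
    remaining = dict(reactions)
    groups = []
    while remaining:
        rid, species = remaining.popitem()
        group, frontier = {rid}, set(species)
        grew = True
        while grew:  #expand the component until no new reaction joins it
            grew = False
            for other in list(remaining):
                if frontier & remaining[other]:
                    frontier |= remaining.pop(other)
                    group.add(other)
                    grew = True
        groups.append(group)
    return groups

#e.g. group_reactions({'r1': {'A', 'B'}, 'r2': {'B', 'C'}, 'r3': {'D'}})
#yields [{'r1', 'r2'}, {'r3'}] (group order may vary)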
Example #4
 def __init__(self, 
               abort_event,
               polymerase,
               with_exonuclease,
               num_cycles,
               ):
     MultiprocessingBase.__init__(self, abort_event)
     self._polymerase       = polymerase
     self._with_exonuclease = with_exonuclease
     self._num_cycles       = num_cycles
     self._PCR_P            = deepcopy(tdf.PCR_P)
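
The deepcopy(tdf.PCR_P) line gives each instance a private copy of a module-level parameter set, so per-instance changes cannot leak into the shared defaults. A minimal illustration of the same idiom (PCR_DEFAULTS is a hypothetical stand-in for tdf.PCR_P):

from copy import deepcopy

#hypothetical module-level defaults standing in for tdf.PCR_P
PCR_DEFAULTS = {'dNTP': 200e-6, 'Mg': 1.5e-3}

class Simulation:
    def __init__(self):
        #a private mutable copy: editing it leaves the shared defaults intact
        self._params = deepcopy(PCR_DEFAULTS)

a, b = Simulation(), Simulation()
a._params['Mg'] = 3e-3
assert PCR_DEFAULTS['Mg'] == 1.5e-3 and b._params['Mg'] == 1.5e-3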
Example #7
 def __init__(self, abort_event, *args, **kwargs):
     iPCR_Interface.__init__(self, abort_event, *args, **kwargs)
     MultiprocessingBase.__init__(self, abort_event) 
     self._blast_results    = None
     self._bounds           = None
     self._query            = None
     #PCR parameters
     self._PCR_Simulations  = dict()
     #query_id uses a hash of the primers instead of the all-config hash 
     #used in job_id
     self._primers_hash = (hash(tuple(self._primers)) & 0xFFFFFFF)
     self._query_id  = re.split(r'_[0-9]+\Z', self._job_id)[0]
     self._query_id += '_%s' % str(self._primers_hash)
     #results
     self._results_filename = self._query_id+'-blast.xml'
     self._query_filename   = self._query_id+'-blast.cfg'
     #reports
     self._hits_report_filename = '%s-%s-hits.txt' % (self._job_id, 
                                                      self._PCR_report_suffix.rstrip('-PCR'))
     #flags
     self._have_blast_results = False
     self._have_saved_results = False
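
The query id construction above strips a trailing _<number> from the job id and appends the primers hash masked to 28 bits. A standalone sketch of the same scheme (make_query_id is a hypothetical helper; note that in Python 3 string hashing is salted per process, so the resulting id is only stable within a single run unless PYTHONHASHSEED is fixed):

import re

def make_query_id(job_id, primers):
    #mask the hash to 28 bits to keep the id short
    primers_hash = hash(tuple(primers)) & 0xFFFFFFF
    #drop a trailing numeric suffix such as '_42', if present
    base = re.split(r'_[0-9]+\Z', job_id)[0]
    return '%s_%s' % (base, primers_hash)

#e.g. make_query_id('my-job_42', ('ATGC', 'CGTA')) -> 'my-job_<hash>'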
Example #8
 def __init__(self, abort_event, job_id, primers):
     #initial check
     try:
         if len(primers) == 0: 
             raise ValueError('AllSecStructures: no primers given.')
     except TypeError:
         raise TypeError(('AllSecStructures: primers should be an iterable. '
                         'Given %s instead') % str(primers))
     ReporterInterface.__init__(self)
     MultiprocessingBase.__init__(self, abort_event)
     #all primers list and concentrations dict
     self._primers        = primers
     self._all_primers    = []
     self._concentrations = dict()
     self._reactions      = dict()
     self._self           = []
     self._cross          = []
     self._all_structures = dict()
     self._equilibrium_concentrations = None
     #reports
     self._job_id = job_id
     self._short_structs_filename = job_id+'-structures-short.txt'
     self._full_structs_filename  = job_id+'-structures-full.txt'
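
The initial check in this example relies on len() raising TypeError for unsized arguments and converts it into a clearer message. The same pattern in isolation (check_primers is a hypothetical helper):

def check_primers(primers):
    try:
        if len(primers) == 0:
            raise ValueError('no primers given')
    except TypeError:
        #len() raised: primers is not a sized collection
        raise TypeError('primers should be a sized iterable. '
                        'Given %s instead' % str(primers))

check_primers(['ATGC'])  #passes silently
#check_primers(42)       #raises TypeError with the message above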
Example #10
 def __init__(self, abort_event):
     MultiprocessingBase.__init__(self, abort_event)
     self._a = 5
Example #11
class SearchEngine(MultiprocessingBase):
    '''Fast search of a pattern sequence in a given set of template sequences 
    with parallelization using multiprocessing.'''
    
    _w_3_0 = 1                        #trivial 3rd root of unity 
    _w_3_1 = (-1/2.0+np.sqrt(3)/2.0j) #3rd root of unity to the power of 1
    _w_3_2 = (-1/2.0-np.sqrt(3)/2.0j) #3rd root of unity to the power of 2
    
    _unambiguous = array('b','ATGC')  #ATGC characters as byte array
    
    #alphabet mappings to the 3rd roots of unity for templates and patterns
    _T_AT_mapping = tuple(zip(_unambiguous, (_w_3_1,_w_3_2,0,0))) 
    
    _T_GC_mapping = tuple(zip(_unambiguous, (0,0,_w_3_1,_w_3_2)))

    _P_AT_mapping = {'A':_w_3_2,
                     'T':_w_3_1,
                     'G':_w_3_0,
                     'C':_w_3_0,
                     
                     'R':_w_3_2,
                     'Y':_w_3_1,
                     'S':_w_3_0,
                     'W':_w_3_2+_w_3_1,
                     'K':_w_3_1,
                     'M':_w_3_2,
                     'B':_w_3_1,
                     'D':_w_3_2+_w_3_1,
                     'H':_w_3_2+_w_3_1,
                     'V':_w_3_2,
                     'N':_w_3_2+_w_3_1}
    
    _P_GC_mapping = {'A':_w_3_0,
                     'T':_w_3_0,
                     'G':_w_3_2,
                     'C':_w_3_1,
                     
                     'R':_w_3_2,
                     'Y':_w_3_1,
                     'S':_w_3_2+_w_3_1,
                     'W':_w_3_0,
                     'K':_w_3_2,
                     'M':_w_3_1,
                     'B':_w_3_2+_w_3_1,
                     'D':_w_3_2,
                     'H':_w_3_1,
                     'V':_w_3_2+_w_3_1,
                     'N':_w_3_2+_w_3_1}
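
    #with these mappings an unambiguous template letter and the matching 
    #pattern letter are mutually conjugate roots, so their product is 
    #w**3 = 1, while a mismatching pair multiplies to a non-trivial root 
    #(real part -1/2) or to 0 in the other channel; _find_in_chunk's 
    #score*(2/3) + p_len/3 then counts each match as 1 and each mismatch 
    #as 0 (degenerate codes, mapped to sums of roots, score partially)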
    
    #template sequences greater than this value will be split into slices of 
    #approximately this size 
    _max_chunk_size = 2**12
    ###########################################################################


    @classmethod
    def set_max_chunk(cls, chunk_size):
        cls._max_chunk_size = chunk_size
    
   
    @classmethod
    def _map_pattern(cls, pattern, map_len):
        '''Map pattern sequence to an alphabet of 3rd roots of unity.'''
        #this naive algorithm works ~5 times faster 
        #than the one used for template mapping due to the short length of patterns
        AT_map = np.zeros(map_len, dtype=complex)
        GC_map = np.zeros(map_len, dtype=complex)
        for i,letter in enumerate(pattern):
            AT_map[i] = cls._P_AT_mapping[letter]
            GC_map[i] = cls._P_GC_mapping[letter]
        return (AT_map, GC_map)
    #end def
    

    @staticmethod
    def _compile_duplexes_for_position(position, template, primer, 
                                           t_len, p_len, reverse):
        '''Given a template strand, a primer and a location where the 
        primer matches the template, return a list of Duplexes formed by 
        unambiguous components of the primer'''
        duplexes = []
        for var in primer.seq_records:
            dup = Duplex(str(var.seq), str(template[position:position+p_len]), 
                         name=var.id, revcomp=True)
            if dup: duplexes.append(dup)
        if reverse: return t_len+1-(position+p_len), tuple(duplexes)
        else: return position+p_len, tuple(duplexes)
    #end def
    _compile_duplexes_mapper = staticmethod(MultiprocessingBase.data_mapper(_compile_duplexes_for_position.__func__)) 
    
    
    @staticmethod
    @MultiprocessingBase.results_assembler
    def _duplexes_assembler(index, result, output):
        if result[1]: output.append(result)
    #end def
    

    def compile_duplexes_mp(self, counter,  
                              fwd_seq, rev_seq, primer, 
                              t_len, p_len,
                              fwd_matches, rev_matches, num_jobs=None):
        '''Compile duplexes for both strands of a template in parallel'''
        if not len(fwd_matches)+len(rev_matches): return None
        counter.set_subwork(2, map(len, (fwd_matches, rev_matches)))
        if len(fwd_matches): counter[0].set_work(len(fwd_matches))
        if len(rev_matches): counter[1].set_work(len(rev_matches))
        #prepare and start two sets of jobs
        with tdf.AcquireParameters():
            fwd_results = []; rev_results = []
            fwd_work = self.Work(timeout=0.1, counter=counter[0])
            fwd_work.start_work(self._compile_duplexes_mapper, 
                                  fwd_matches, num_jobs, 
                                  fwd_seq, primer, t_len, p_len, False)
            fwd_work.assemble(self._duplexes_assembler, fwd_results)
            rev_work = self.Work(timeout=0.1, counter=counter[1])
            rev_work.start_work(self._compile_duplexes_mapper, 
                                  rev_matches, num_jobs, 
                                  rev_seq, primer, t_len, p_len, True)
            rev_work.assemble(self._duplexes_assembler, rev_results)
            if not self.wait(fwd_work, rev_work): return None
        if self.aborted() or (not fwd_results and not rev_results): return None
        #sort duplexes by position and return them
        fwd_results.sort(key=lambda x: x[0])
        rev_results.sort(key=lambda x: x[0])
        return fwd_results,rev_results
    #end def
    
    
    def compile_duplexes(self, counter,
                           fwd_seq, rev_seq, primer, 
                           t_len, p_len,
                           fwd_matches, rev_matches):
        '''Compile duplexes for both strands of a template'''
        if not len(fwd_matches)+len(rev_matches): return None
        counter.set_work(len(fwd_matches)+len(rev_matches))
        with tdf.AcquireParameters():
            fwd_results = []
            for pos in fwd_matches:
                if self.aborted(): break
                duplexes = self._compile_duplexes_for_position(pos, fwd_seq, primer, 
                                                               t_len, p_len, reverse=False)
                if duplexes[1]: fwd_results.append(duplexes)
                counter.count()
            rev_results = []
            for pos in rev_matches:
                if self.aborted(): break
                duplexes = self._compile_duplexes_for_position(pos, rev_seq, primer, 
                                                               t_len, p_len, reverse=True)
                if duplexes[1]: rev_results.append(duplexes)
                counter.count()
        if self.aborted(): return None
        if not fwd_results: fwd_results = None
        if not rev_results: rev_results = None
        if not fwd_results and not rev_results: return None
        return fwd_results, rev_results
    #end def
        
    
    @classmethod
    def _find_in_chunk(cls, t_chunk, p_fft, correction, c_size, c_stride):
        '''Count the number of matches of the pattern at each position in a 
        given chunk of a template.
        The pattern is given as a pair of polynomials evaluated at the n-th 
        roots of unity using fft (p_fft).
        c_size is the length of the chunk's map to the alphabet of 3rd roots 
        of unity; it is a power-of-2 integer so that fft works fast.
        c_stride is the part of the chunk for which matches are calculated 
        (it is less than c_size, so consecutive chunks overlap).'''
        t_AT_map = np.fromiter(array('b',t_chunk), dtype=complex)
        t_AT_map.resize(c_size)
        t_GC_map = t_AT_map.copy()
        for k,v in cls._T_AT_mapping: t_AT_map[t_AT_map == k] = v
        for k,v in cls._T_GC_mapping: t_GC_map[t_GC_map == k] = v
        AT_score = ifft(fft(t_AT_map[::-1])*p_fft[0])[::-1][:c_stride]
        GC_score = ifft(fft(t_GC_map[::-1])*p_fft[1])[::-1][:c_stride]
        score    = AT_score.real + GC_score.real
        score    = (score*(2.0/3.0) + correction)
        return score
    #end def
    
    
    @classmethod
    def _calculate_chunk_size(cls, t_len, p_len):
        rem = lambda c: (t_len/(c-p_len))*(c-p_len)+c-t_len
        if t_len <= cls._max_chunk_size:
            chunk = 2**int(np.ceil(np.log2(t_len)))
            if chunk % t_len == 0: return chunk
        else: chunk = cls._max_chunk_size
        r = rem(chunk)
        min_chunk = 2**int(np.ceil(np.log2(2*p_len)))
        max_rem   = chunk/2+1
        while r > max_rem \
        and chunk > min_chunk:
            chunk /= 2
            r = rem(chunk)
        return max(chunk, min_chunk)
    #end def
    
    
    @staticmethod
    def _check_length_inequality(t_len, p_len):
        if t_len < p_len or p_len == 0:
            raise ValueError('SearchEngine._find: template sequence should be '
                             'longer than or equal to the primer sequence, and '
                             'both should be greater than zero.')
    #end def
    
    
    @staticmethod
    def mp_better(t_len):
        #based on computation time statistics
        return cpu_count > 1 and t_len > 25000
    #end def
    
    
    @staticmethod
    def _optimal_slices(t_len, p_len):
        #linear regression of measured computing time with respect to 
        #number of slices and template length
        linear = max(cpu_count, int(t_len*1.75e-5 + 1.75)) 
        return min(60, linear, t_len/p_len)
    #end def
    
    
    @staticmethod
    @MultiprocessingBase.worker
    def _find_in_slice(abort_e, _id, start, end, fwd_seq, rev_seq, 
                         p_fft, correction, s_stride, c_size, c_stride):
        fwd_score = []
        rev_score = []
        pos       = start
        while pos < end:
            if aborted(abort_e): return None
            front = min(end, pos+c_size)
            fwd_score.append(SearchEngine._find_in_chunk(fwd_seq[pos:front], 
                                                         p_fft, correction,
                                                         c_size, c_stride))
            rev_score.append(SearchEngine._find_in_chunk(rev_seq[pos:front], 
                                                         p_fft, correction,
                                                         c_size, c_stride))
            pos += c_stride
        return (start, 
                np.concatenate(fwd_score)[:s_stride], 
                np.concatenate(rev_score)[:s_stride])
    #end def
    

    def _find_mp(self, counter, template, primer, t_len, p_len, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a 
        template sequence with at most k mismatches. Multiprocessing version.'''
        slice_size   = t_len/self._optimal_slices(t_len, p_len)+p_len+1
        slice_stride = slice_size-p_len
        chunk_size   = self._calculate_chunk_size(slice_size, p_len)
        chunk_stride = chunk_size-p_len
        p_maps       = self._map_pattern(str(primer.master_sequence.seq), chunk_size)
        p_fft        = (fft(p_maps[0]),fft(p_maps[1]))
        fwd_seq      = str(template.seq)
        rev_seq      = reverse_complement(fwd_seq)
        correction   = np.ndarray(chunk_stride); correction.fill(p_len/3.0)
        #start find_in_slice jobs
        counter.set_work(t_len/slice_stride+1)
        pos = 0; work = self.Work(counter=counter)
        while pos < t_len and not self.aborted():
            front = min(t_len, pos+slice_size)
            queue = self._Queue()
            job = self._Process(target=self._find_in_slice, 
                                args=(queue, self._abort_event, 
                                      len(work), 
                                      pos, front, 
                                      fwd_seq, rev_seq, 
                                      p_fft, correction,  
                                      slice_stride, chunk_size, chunk_stride))
            job.daemon = 1
            job.start()
            work.add_job(job, queue)
            pos += slice_stride
        work.start_jobs()
        #if scores arrays are allocated beforehand, the memory
        #is returned upon deletion
        scores_len = slice_stride*len(work)
        scores = [np.zeros(scores_len), np.zeros(scores_len)]
        def assembler(out, scores):
            if out:
                scores[0][out[0]:out[0]+slice_stride] = out[1]
                scores[1][out[0]:out[0]+slice_stride] = out[2]
        work.assemble(assembler, scores)
        if not work.wait() or self.aborted(): return None
        #compute match indices
        matches     = max(1, p_len - mismatches)-0.5
        fwd_matches = np.where(scores[0][:t_len-p_len+1] >= matches)[0]
        rev_matches = np.where(scores[1][:t_len-p_len+1] >= matches)[0] 
        return fwd_seq, rev_seq, primer, t_len, p_len, fwd_matches, rev_matches
    #end def

    
    def _find(self, counter, template, primer, t_len, p_len, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a 
        template sequence with at most k mismatches.'''
        chunk_size   = self._calculate_chunk_size(t_len, p_len)
        chunk_stride = chunk_size-p_len
        p_maps       = self._map_pattern(str(primer.master_sequence.seq), chunk_size)
        p_fft        = (fft(p_maps[0]),fft(p_maps[1]))
        fwd_seq      = str(template.seq)
        rev_seq      = reverse_complement(fwd_seq)
        correction   = np.ndarray(chunk_stride); correction.fill(p_len/3.0)
        fwd_score    = np.zeros(t_len+chunk_stride)
        rev_score    = np.zeros(t_len+chunk_stride)
        #search the template in chunks, which is faster due to the lower 
        #cost of memory allocation
        pos = 0; counter.set_work(chunk_stride*(t_len/chunk_stride+1))
        while pos < t_len and not self.aborted():
            front = min(t_len, pos+chunk_size)
            fwd_score[pos:pos+chunk_stride] = self._find_in_chunk(fwd_seq[pos:front], 
                                                                  p_fft, correction, 
                                                                  chunk_size, chunk_stride) 
            rev_score[pos:pos+chunk_stride] = self._find_in_chunk(rev_seq[pos:front], 
                                                                  p_fft, correction, 
                                                                  chunk_size, chunk_stride)
            counter.count(chunk_stride)
            pos += chunk_stride
        #if search was aborted, return empty results
        if self.aborted(): return None
        #match indexes
        matches     = max(1, p_len - mismatches)-0.5
        fwd_matches = np.where(fwd_score[:t_len-p_len+1] >= matches)[0]; del fwd_score
        rev_matches = np.where(rev_score[:t_len-p_len+1] >= matches)[0]; del rev_score
        return fwd_seq, rev_seq, primer, t_len, p_len, fwd_matches, rev_matches
    #end def
    

    def find_matches(self, counter, template, primer, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a 
        template sequence with at most k mismatches. This method uses 
        multiprocessing to speed up the search process. Use it to search 
        in a long sequence.''' 
        p_len,t_len = len(primer),len(template)
        self._check_length_inequality(t_len, p_len)
        if self.mp_better(t_len): find = self._find_mp
        else: find = self._find
        return find(counter, template, primer, t_len, p_len, mismatches)
    #end def
    

    def find(self, counter, template, primer, mismatches):
        '''Find occurrences of a degenerate primer in a template sequence.
        Return positions and the Duplexes formed. This method uses 
        multiprocessing to speed up the search process. Use it to search 
        in a long sequence.'''
        t_len = len(template)
        counter.set_subwork(2, (t_len, t_len*primer.num_components))
        matches = self.find_matches(counter[0], template, primer, mismatches)
        if matches is None: return None
        duplexes = self.compile_duplexes_mp(counter[1], *matches)
        return to_shelf(duplexes) if duplexes else None
    #end def
    
    @MultiprocessingBase.data_mapper_method
    def _batch_find(self, template, primer, p_len, mismatches):
        matches = self._find(WorkCounter(), template, primer, len(template), p_len, mismatches)
        if matches is None: return None
        duplexes = self.compile_duplexes(WorkCounter(), *matches) 
        return (template.id, to_shelf(duplexes)) if duplexes else None 
    #end def
    
    def batch_find(self, counter, templates, primer, mismatches, **kwargs):
        '''Find occurrences of a degenerate primer in each of the provided 
        templates. Return a dictionary of results keyed by template ID.
        Multiprocessing parallelizes across templates, while each individual 
        search runs serially. Use it to search in many short sequences.'''
        work = self.Work(0.1, counter=counter)
        work.start_work(self._batch_find, templates, None,
                          primer, len(primer), mismatches, **kwargs)
        @MultiprocessingBase.results_assembler
        def assembler(index, result, results):
            if result: results[result[0]] = result[1]
        results = {}
        work.assemble(assembler, results)
        if not work.wait(): return None
        return results
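
For reference, the core trick SearchEngine's docstrings describe, counting pattern matches at every alignment via FFT cross-correlation over a roots-of-unity encoding, can be reproduced in a few lines. This is a minimal sketch for plain ATGC strings only, without degenerate codes, chunking or multiprocessing; all names below are local to the sketch:

import numpy as np

W1 = complex(-0.5,  np.sqrt(3)/2)  #primitive 3rd root of unity
W2 = complex(-0.5, -np.sqrt(3)/2)  #its square (and complex conjugate)

#template and pattern mappings: matching letters are conjugate roots
T_AT = {'A': W1, 'T': W2, 'G': 0, 'C': 0}
T_GC = {'G': W1, 'C': W2, 'A': 0, 'T': 0}
P_AT = {'A': W2, 'T': W1, 'G': 1, 'C': 1}
P_GC = {'G': W2, 'C': W1, 'A': 1, 'T': 1}

def match_counts(text, pattern):
    '''Number of matching characters of `pattern` at every alignment in `text`.'''
    n, m = len(text), len(pattern)
    N = n + m - 1  #length of the linear (non-circular) correlation
    score = np.zeros(n - m + 1)
    for t_map, p_map in ((T_AT, P_AT), (T_GC, P_GC)):
        t = np.array([t_map[c] for c in text])
        #reversing the pattern turns convolution into correlation
        p = np.array([p_map[c] for c in pattern])[::-1]
        conv = np.fft.ifft(np.fft.fft(t, N) * np.fft.fft(p, N))
        score += conv[m-1:n].real  #conv[i+m-1] is the correlation at alignment i
    #each match contributes 1, each mismatch 0 (see the mapping comments above)
    return score * (2.0/3.0) + m / 3.0

counts = match_counts('ATGCATGCAT', 'TGCA')
#exact occurrences: alignments with at least m - 0.5 matches
print(np.where(counts >= 4 - 0.5)[0])  #-> [1 5]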