def __init__(self, abort_event, *args, **kwargs):
    MixtureFactory.__init__(self, abort_event, *args, **kwargs)
    MultiprocessingBase.__init__(self, abort_event)
    self._searcher = SearchEngine(self._abort_event)
    self._p_weights = [p.num_components for p in self._primers]
    self._pw_sum = sum(self._p_weights)
    self._num_p = len(self._primers)
def __init__(self, abort_event, reactions, concentrations, precision=1e-10):
    MultiprocessingBase.__init__(self, abort_event)
    EquilibriumBase.__init__(self, reactions, concentrations, precision)
    #input parameters
    self.reactions = reactions
    self.concentrations = concentrations
    #group reactions by the connected components of their reaction graph
    self._reactions_groups = self._group_reactions()
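# A minimal sketch of how reactions could be grouped by the connected
# components of a "shares a species" graph, so that each group can be solved
# for equilibrium independently. This helper is hypothetical: it assumes
# `reactions` maps a reaction id to an iterable of the species ids taking
# part in it, and it is not the actual _group_reactions implementation.
def group_by_shared_species(reactions):
    '''Union-find over reaction ids: two reactions end up in the same group
    if they are linked by a chain of shared species.'''
    parent = {}
    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]   #path halving
            x = parent[x]
        return x
    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb: parent[ra] = rb
    first_seen = {}                          #species id -> first reaction seen
    for rid, species in reactions.items():
        find(rid)                            #register isolated reactions too
        for s in species:
            if s in first_seen: union(rid, first_seen[s])
            else: first_seen[s] = rid
    groups = {}
    for rid in reactions:
        groups.setdefault(find(rid), set()).add(rid)
    return list(groups.values())

# e.g. group_by_shared_species({'r1': 'AB', 'r2': 'BC', 'r3': 'D'})
# -> [{'r1', 'r2'}, {'r3'}]  (group order is arbitrary)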
def __init__(self, abort_event, polymerase, with_exonuclease, num_cycles):
    MultiprocessingBase.__init__(self, abort_event)
    self._polymerase = polymerase
    self._with_exonuclease = with_exonuclease
    self._num_cycles = num_cycles
    self._PCR_P = deepcopy(tdf.PCR_P)
def __init__(self, abort_event, *args, **kwargs):
    iPCR_Interface.__init__(self, abort_event, *args, **kwargs)
    MultiprocessingBase.__init__(self, abort_event)
    self._blast_results = None
    self._bounds = None
    self._query = None
    #PCR parameters
    self._PCR_Simulations = dict()
    #query_id: use a hash of the primers instead of the all-config hash used in job_id
    self._primers_hash = hash(tuple(self._primers)) & 0xFFFFFFF
    self._query_id = re.split(r'_[0-9]+\Z', self._job_id)[0]
    self._query_id += '_%s' % str(self._primers_hash)
    #results
    self._results_filename = self._query_id + '-blast.xml'
    self._query_filename = self._query_id + '-blast.cfg'
    #reports
    #note: str.rstrip strips a *set* of characters, so the '-PCR' suffix
    #is removed explicitly instead
    suffix = self._PCR_report_suffix
    if suffix.endswith('-PCR'): suffix = suffix[:-len('-PCR')]
    self._hits_report_filename = '%s-%s-hits.txt' % (self._job_id, suffix)
    #flags
    self._have_blast_results = False
    self._have_saved_results = False
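# A small illustration of the query-id scheme above (values are hypothetical;
# note that in Python 3 string hashes are randomized between runs): the
# trailing '_<counter>' of the job id is stripped and replaced with a 28-bit
# hash of the primer tuple, so runs that share primers share a query id.
import re
job_id = 'mytask_42'
primers = ('ATGCATGC', 'GGCCTTAA')
query_id = re.split(r'_[0-9]+\Z', job_id)[0] + '_%s' % (hash(primers) & 0xFFFFFFF)
#query_id looks like 'mytask_261095710'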
def __init__(self, abort_event, job_id, primers):
    #initial check
    try:
        if len(primers) == 0:
            raise ValueError('AllSecStructures: no primers given.')
    except TypeError:
        raise TypeError(('AllSecStructures: primers should be an iterable. '
                         'Given %s instead') % str(primers))
    ReporterInterface.__init__(self)
    MultiprocessingBase.__init__(self, abort_event)
    #all primers list and concentrations dict
    self._primers = primers
    self._all_primers = []
    self._concentrations = dict()
    self._reactions = dict()
    self._self = []
    self._cross = []
    self._all_structures = dict()
    self._equilibrium_concentrations = None
    #reports
    self._job_id = job_id
    self._short_structs_filename = job_id + '-structures-short.txt'
    self._full_structs_filename = job_id + '-structures-full.txt'
def __init__(self, abort_event):
    MultiprocessingBase.__init__(self, abort_event)
    self._a = 5
class SearchEngine(MultiprocessingBase):
    '''Fast search of a pattern sequence in a given set of template sequences
    with parallelization using multiprocessing.'''

    _w_3_0 = 1                            #trivial 3d root of unity
    _w_3_1 = (-1/2.0+np.sqrt(3)/2.0j)     #3d root of unity to the power of 1
    _w_3_2 = (-1/2.0-np.sqrt(3)/2.0j)     #3d root of unity to the power of 2

    _unambiguous = array('b', 'ATGC')     #ATGC characters as a byte array

    #alphabet mappings to the 3d roots of unity for templates and patterns
    _T_AT_mapping = tuple(zip(_unambiguous, (_w_3_1, _w_3_2, 0, 0)))
    _T_GC_mapping = tuple(zip(_unambiguous, (0, 0, _w_3_1, _w_3_2)))

    _P_AT_mapping = {'A':_w_3_2, 'T':_w_3_1, 'G':_w_3_0, 'C':_w_3_0,
                     'R':_w_3_2, 'Y':_w_3_1, 'S':_w_3_0,
                     'W':_w_3_2+_w_3_1, 'K':_w_3_1, 'M':_w_3_2,
                     'B':_w_3_1, 'D':_w_3_2+_w_3_1, 'H':_w_3_2+_w_3_1,
                     'V':_w_3_2, 'N':_w_3_2+_w_3_1}

    _P_GC_mapping = {'A':_w_3_0, 'T':_w_3_0, 'G':_w_3_2, 'C':_w_3_1,
                     'R':_w_3_2, 'Y':_w_3_1, 'S':_w_3_2+_w_3_1,
                     'W':_w_3_0, 'K':_w_3_2, 'M':_w_3_1,
                     'B':_w_3_2+_w_3_1, 'D':_w_3_2, 'H':_w_3_1,
                     'V':_w_3_2+_w_3_1, 'N':_w_3_2+_w_3_1}

    #template sequences greater than this value will be split into slices of
    #approximately this size
    _max_chunk_size = 2**12

    ###########################################################################

    @classmethod
    def set_max_chunk(cls, chunk_size):
        cls._max_chunk_size = chunk_size

    @classmethod
    def _map_pattern(cls, pattern, map_len):
        '''Map a pattern sequence to the alphabet of 3d roots of unity.'''
        #this naive algorithm works ~5 times faster than the one used for
        #template mapping, due to the short length of patterns
        AT_map = np.zeros(map_len, dtype=complex)
        GC_map = np.zeros(map_len, dtype=complex)
        for i, letter in enumerate(pattern):
            AT_map[i] = cls._P_AT_mapping[letter]
            GC_map[i] = cls._P_GC_mapping[letter]
        return (AT_map, GC_map)
    #end def

    @staticmethod
    def _compile_duplexes_for_position(position, template, primer,
                                       t_len, p_len, reverse):
        '''Given a template strand, a primer and a location where the primer
        matches the template, return a list of Duplexes formed by the
        unambiguous components of the primer.'''
        duplexes = []
        for var in primer.seq_records:
            dup = Duplex(str(var.seq), str(template[position:position+p_len]),
                         name=var.id, revcomp=True)
            if dup: duplexes.append(dup)
        if reverse:
            return t_len+1-(position+p_len), tuple(duplexes)
        else:
            return position+p_len, tuple(duplexes)
    #end def

    _compile_duplexes_mapper = staticmethod(
        MultiprocessingBase.data_mapper(_compile_duplexes_for_position.__func__))

    @staticmethod
    @MultiprocessingBase.results_assembler
    def _duplexes_assembler(index, result, output):
        if result[1]: output.append(result)
    #end def

    def compile_duplexes_mp(self, counter, fwd_seq, rev_seq, primer,
                            t_len, p_len, fwd_matches, rev_matches,
                            num_jobs=None):
        '''Compile duplexes for both strands of a template in parallel.'''
        if not len(fwd_matches)+len(rev_matches): return None
        counter.set_subwork(2, map(len, (fwd_matches, rev_matches)))
        if len(fwd_matches): counter[0].set_work(len(fwd_matches))
        if len(rev_matches): counter[1].set_work(len(rev_matches))
        #prepare and start two sets of jobs
        with tdf.AcquireParameters():
            fwd_results = []; rev_results = []
            fwd_work = self.Work(timeout=0.1, counter=counter[0])
            fwd_work.start_work(self._compile_duplexes_mapper, fwd_matches,
                                num_jobs, fwd_seq, primer, t_len, p_len, False)
            fwd_work.assemble(self._duplexes_assembler, fwd_results)
            rev_work = self.Work(timeout=0.1, counter=counter[1])
            rev_work.start_work(self._compile_duplexes_mapper, rev_matches,
                                num_jobs, rev_seq, primer, t_len, p_len, True)
            rev_work.assemble(self._duplexes_assembler, rev_results)
            if not self.wait(fwd_work, rev_work): return None
        if self.aborted() or (not fwd_results and not rev_results): return None
        #sort duplexes by position and return them
        fwd_results.sort(key=lambda x: x[0])
        rev_results.sort(key=lambda x: x[0])
        return fwd_results, rev_results
    #end def

    def compile_duplexes(self, counter, fwd_seq, rev_seq, primer,
                         t_len, p_len, fwd_matches, rev_matches):
        '''Compile duplexes for both strands of a template.'''
        if not len(fwd_matches)+len(rev_matches): return None
        counter.set_work(len(fwd_matches)+len(rev_matches))
        with tdf.AcquireParameters():
            fwd_results = []
            for pos in fwd_matches:
                if self.aborted(): break
                duplexes = self._compile_duplexes_for_position(pos, fwd_seq, primer,
                                                               t_len, p_len,
                                                               reverse=False)
                if duplexes[1]: fwd_results.append(duplexes)
                counter.count()
            rev_results = []
            for pos in rev_matches:
                if self.aborted(): break
                duplexes = self._compile_duplexes_for_position(pos, rev_seq, primer,
                                                               t_len, p_len,
                                                               reverse=True)
                if duplexes[1]: rev_results.append(duplexes)
                counter.count()
        if self.aborted(): return None
        if not fwd_results: fwd_results = None
        if not rev_results: rev_results = None
        if not fwd_results and not rev_results: return None
        return fwd_results, rev_results
    #end def

    @classmethod
    def _find_in_chunk(cls, t_chunk, p_fft, correction, c_size, c_stride):
        '''Find the number of matches of the pattern at each position in a
        given chunk of a template. The pattern is given as a polynomial
        evaluated at the n-th roots of unity using fft. c_size is the length
        of the map of the template chunk to the alphabet of 3d roots of
        unity; it is a power-of-2 integer so that the fft works fast.
        c_stride is the part of the chunk for which matches are calculated
        (it is less than c_size, so chunks overlap each other).'''
        t_AT_map = np.fromiter(array('b', t_chunk), dtype=complex)
        t_AT_map.resize(c_size)
        t_GC_map = t_AT_map.copy()
        for k, v in cls._T_AT_mapping:
            t_AT_map[t_AT_map == k] = v
        for k, v in cls._T_GC_mapping:
            t_GC_map[t_GC_map == k] = v
        AT_score = ifft(fft(t_AT_map[::-1])*p_fft[0])[::-1][:c_stride]
        GC_score = ifft(fft(t_GC_map[::-1])*p_fft[1])[::-1][:c_stride]
        score = AT_score.real + GC_score.real
        score = score*(2.0/3.0) + correction
        return score
    #end def

    @classmethod
    def _calculate_chunk_size(cls, t_len, p_len):
        rem = lambda c: (t_len/(c-p_len))*(c-p_len)+c-t_len
        if t_len <= cls._max_chunk_size:
            chunk = 2**int(np.ceil(np.log2(t_len)))
            if chunk % t_len == 0: return chunk
        else: chunk = cls._max_chunk_size
        r = rem(chunk)
        min_chunk = 2**int(np.ceil(np.log2(2*p_len)))
        max_rem = chunk/2+1
        while r > max_rem and chunk > min_chunk:
            chunk /= 2
            r = rem(chunk)
        return max(chunk, min_chunk)
    #end def

    @staticmethod
    def _check_length_inequality(t_len, p_len):
        if t_len < p_len or p_len == 0:
            raise ValueError('SearchEngine._find: template sequence should be '
                             'longer than or equal to the primer sequence and '
                             'both should be greater than zero.')
    #end def

    @staticmethod
    def mp_better(t_len):
        #based on computation time statistics
        return cpu_count > 1 and t_len > 25000
    #end def

    @staticmethod
    def _optimal_slices(t_len, p_len):
        #linear regression of measured computing time with respect to
        #the number of slices and the template length
        linear = max(cpu_count, int(t_len*1.75e-5 + 1.75))
        return min(60, linear, t_len/p_len)
    #end def

    @staticmethod
    @MultiprocessingBase.worker
    def _find_in_slice(abort_e, _id, start, end, fwd_seq, rev_seq, p_fft,
                       correction, s_stride, c_size, c_stride):
        fwd_score = []
        rev_score = []
        pos = start
        while pos < end:
            if aborted(abort_e): return None
            front = min(end, pos+c_size)
            fwd_score.append(SearchEngine._find_in_chunk(fwd_seq[pos:front],
                                                         p_fft, correction,
                                                         c_size, c_stride))
            rev_score.append(SearchEngine._find_in_chunk(rev_seq[pos:front],
                                                         p_fft, correction,
                                                         c_size, c_stride))
            pos += c_stride
        return (start,
                np.concatenate(fwd_score)[:s_stride],
                np.concatenate(rev_score)[:s_stride])
    #end def

    def _find_mp(self, counter, template, primer, t_len, p_len, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a
        template sequence with at most k mismatches. Multiprocessing version.'''
        slice_size = t_len/self._optimal_slices(t_len, p_len)+p_len+1
        slice_stride = slice_size-p_len
        chunk_size = self._calculate_chunk_size(slice_size, p_len)
        chunk_stride = chunk_size-p_len
        p_maps = self._map_pattern(str(primer.master_sequence.seq), chunk_size)
        p_fft = (fft(p_maps[0]), fft(p_maps[1]))
        fwd_seq = str(template.seq)
        rev_seq = reverse_complement(fwd_seq)
        correction = np.ndarray(chunk_stride)
        correction.fill(p_len/3.0)
        #start find_in_slice jobs
        counter.set_work(t_len/slice_stride+1)
        pos = 0; work = self.Work(counter=counter)
        while pos < t_len and not self.aborted():
            front = min(t_len, pos+slice_size)
            queue = self._Queue()
            job = self._Process(target=self._find_in_slice,
                                args=(queue, self._abort_event,
                                      len(work), pos, front,
                                      fwd_seq, rev_seq, p_fft, correction,
                                      slice_stride, chunk_size, chunk_stride))
            job.daemon = 1
            job.start()
            work.add_job(job, queue)
            pos += slice_stride
        work.start_jobs()
        #if the scores arrays are allocated beforehand, the memory
        #is returned upon deletion
        scores_len = slice_stride*len(work)
        scores = [np.zeros(scores_len), np.zeros(scores_len)]
        def assembler(out, scores):
            if out:
                scores[0][out[0]:out[0]+slice_stride] = out[1]
                scores[1][out[0]:out[0]+slice_stride] = out[2]
        work.assemble(assembler, scores)
        if not work.wait() or self.aborted(): return None
        #compute match indices
        matches = max(1, p_len - mismatches)-0.5
        fwd_matches = np.where(scores[0][:t_len-p_len+1] >= matches)[0]
        rev_matches = np.where(scores[1][:t_len-p_len+1] >= matches)[0]
        return fwd_seq, rev_seq, primer, t_len, p_len, fwd_matches, rev_matches
    #end def

    def _find(self, counter, template, primer, t_len, p_len, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a
        template sequence with at most k mismatches.'''
        chunk_size = self._calculate_chunk_size(t_len, p_len)
        chunk_stride = chunk_size-p_len
        p_maps = self._map_pattern(str(primer.master_sequence.seq), chunk_size)
        p_fft = (fft(p_maps[0]), fft(p_maps[1]))
        fwd_seq = str(template.seq)
        rev_seq = reverse_complement(fwd_seq)
        correction = np.ndarray(chunk_stride)
        correction.fill(p_len/3.0)
        fwd_score = np.zeros(t_len+chunk_stride)
        rev_score = np.zeros(t_len+chunk_stride)
        #_find in chunks of the template, which is faster due to the lower
        #cost of memory allocation
        pos = 0; counter.set_work(chunk_stride*(t_len/chunk_stride+1))
        while pos < t_len and not self.aborted():
            front = min(t_len, pos+chunk_size)
            fwd_score[pos:pos+chunk_stride] = self._find_in_chunk(fwd_seq[pos:front],
                                                                  p_fft, correction,
                                                                  chunk_size, chunk_stride)
            rev_score[pos:pos+chunk_stride] = self._find_in_chunk(rev_seq[pos:front],
                                                                  p_fft, correction,
                                                                  chunk_size, chunk_stride)
            counter.count(chunk_stride)
            pos += chunk_stride
        #if the search was aborted, return empty results
        if self.aborted(): return None
        #match indices
        matches = max(1, p_len - mismatches)-0.5
        fwd_matches = np.where(fwd_score[:t_len-p_len+1] >= matches)[0]; del fwd_score
        rev_matches = np.where(rev_score[:t_len-p_len+1] >= matches)[0]; del rev_score
        return fwd_seq, rev_seq, primer, t_len, p_len, fwd_matches, rev_matches
    #end def
    def find_matches(self, counter, template, primer, mismatches):
        '''Find all occurrences of a primer sequence in both strands of a
        template sequence with at most k mismatches. Uses multiprocessing
        to speed up the search when the template is long enough.'''
        p_len, t_len = len(primer), len(template)
        self._check_length_inequality(t_len, p_len)
        if self.mp_better(t_len): find = self._find_mp
        else: find = self._find
        return find(counter, template, primer, t_len, p_len, mismatches)
    #end def

    def find(self, counter, template, primer, mismatches):
        '''Find occurrences of a degenerate primer in a template sequence.
        Return positions and the Duplexes formed. This method uses
        multiprocessing to speed up the search; use it to search in a
        long sequence.'''
        t_len = len(template)
        counter.set_subwork(2, (t_len, t_len*primer.num_components))
        matches = self.find_matches(counter[0], template, primer, mismatches)
        if matches is None: return None
        duplexes = self.compile_duplexes_mp(counter[1], *matches)
        return to_shelf(duplexes) if duplexes else None
    #end def

    @MultiprocessingBase.data_mapper_method
    def _batch_find(self, template, primer, p_len, mismatches):
        matches = self._find(WorkCounter(), template, primer,
                             len(template), p_len, mismatches)
        if matches is None: return None
        duplexes = self.compile_duplexes(WorkCounter(), *matches)
        return (template.id, to_shelf(duplexes)) if duplexes else None
    #end def

    def batch_find(self, counter, templates, primer, mismatches, **kwargs):
        '''Find occurrences of a degenerate primer in each of the provided
        templates. Return a dictionary of results keyed by template ID.
        Uses multiprocessing to parallelize the searches, each of which is
        itself sequential; use it to search in many short sequences.'''
        work = self.Work(0.1, counter=counter)
        work.start_work(self._batch_find, templates, None,
                        primer, len(primer), mismatches, **kwargs)
        @MultiprocessingBase.results_assembler
        def assembler(index, result, results):
            if result: results[result[0]] = result[1]
        results = {}
        work.assemble(assembler, results)
        if not work.wait(): return None
        return results
    #end def
#end class
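# Below is a minimal, self-contained sketch of the matching trick that
# SearchEngine builds on: letters are mapped to 3d roots of unity on two
# "channels" (A/T and G/C), the pattern uses the conjugate mapping so that
# every matching position contributes w * conj(w) = 1 to an FFT
# cross-correlation, and the corrected score is then thresholded. This is a
# simplified variant for exact matches only: the class's actual mappings also
# handle degenerate IUPAC letters and differ in detail, and chunking and
# multiprocessing are omitted. All names here are illustrative.
import numpy as np

def exact_match_positions(template, pattern):
    '''Return the 0-based positions where `pattern` matches `template`
    exactly, computed via FFT cross-correlation over two complex channels.'''
    t_len, p_len = len(template), len(pattern)
    n = 1 << int(np.ceil(np.log2(t_len + p_len)))    #power-of-2 FFT size
    w1 = -0.5 + np.sqrt(3)/2*1j                      #primitive 3d root of unity
    w2 = w1.conjugate()                              #its square
    #template channels: A/T on one, G/C on the other; other letters -> 0
    t_at = np.array([{'A': w1, 'T': w2}.get(c, 0) for c in template], complex)
    t_gc = np.array([{'G': w1, 'C': w2}.get(c, 0) for c in template], complex)
    #pattern channels use the conjugate mapping, so a match contributes 1
    p_at = np.array([{'A': w2, 'T': w1}.get(c, 0) for c in pattern], complex)
    p_gc = np.array([{'G': w2, 'C': w1}.get(c, 0) for c in pattern], complex)
    def corr(t, p):
        #reversing the pattern turns the convolution into a correlation
        c = np.fft.ifft(np.fft.fft(t, n) * np.fft.fft(p[::-1], n))
        return c[p_len-1:t_len].real
    #per position: match -> +1, same-channel mismatch -> -1/2,
    #cross-channel mismatch -> 0; rescaling makes a full match score p_len
    score = (corr(t_at, p_at) + corr(t_gc, p_gc)) * (2.0/3.0) + p_len/3.0
    return np.where(score >= p_len - 0.5)[0]

print(exact_match_positions('ATGCATGCAT', 'ATGC'))   #-> [0 4]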