def __getitem__(self, x: Union[int, slice]) -> 'DSeq':
    '''Return the sub-duplex selected by an index or slice.

    Indices refer to the duplex coordinate frame.  With a positive
    overhang the forward strand starts ``overhang`` positions into that
    frame, so forward-strand indices are the duplex indices minus
    ``overhang``; with a zero or negative overhang the same shift is
    applied to the reverse strand instead.

    Args:
        x: an integer index (treated as the slice ``x:x + 1``) or a slice

    Returns:
        a new :class:`DSeq` built from the selected part of each strand,
        with the overhang that remains after slicing and the same alphabet
    '''
    fwd: str = self.fwd
    rc_rev: str = self.rc_rev
    overhang: int = self.overhang

    sl: slice = x if isinstance(x, slice) else slice(x, x + 1, 1)
    start_idx: int = sl.start or 0

    def _shift(idx: Union[int, None], delta: int) -> Union[int, None]:
        # Move a duplex index onto the shifted strand.  A non-negative
        # index that lands before the strand's first base is clamped to 0;
        # left negative it would be read by Python as "count from the end"
        # and silently select bases from the wrong end of the strand.
        # Indices that were already negative (or None) are passed through
        # with the plain shift applied.
        if idx is None:
            return None
        moved: int = idx + delta
        if idx >= 0 and moved < 0:
            return 0
        return moved

    if overhang > 0:
        # fwd shifted in 3' direction
        fwd_sl: slice = slice(
            _shift(start_idx, -overhang),
            _shift(sl.stop, -overhang),
            sl.step,
        )
        fwd_out: str = fwd[fwd_sl]
        rev_out: str = reverseComplement(rc_rev[sl])
        overhang_out: int = max(overhang - start_idx, 0)
    else:
        # fwd shifted in the 5' direction relative to the reverse
        # (negative overhang)
        rev_sl: slice = slice(
            _shift(start_idx, overhang),
            _shift(sl.stop, overhang),
            sl.step,
        )
        rev_out: str = reverseComplement(rc_rev[rev_sl])
        fwd_out: str = fwd[sl]
        overhang_out: int = min(start_idx + overhang, 0)
    return DSeq(fwd_out, rev_out, overhang_out, alphabet=self.alphabet)
def __add__(self, b: 'VirtualHelix') -> 'VirtualHelix':
    '''Join ``b`` onto the 3' end of this :class:`VirtualHelix`.

    (1) Concatenates the forward strand with the forward strand and the
    reverse strand with the reverse strand and preserves order
    (2) Realigns the two :class:`VirtualHelix` objects involved

    Args:
        b: a :class:`VirtualHelix` object

    Returns:
        a :class:`VirtualHelix` object

    Raises:
        ValueError: ``b`` is not a :class:`VirtualHelix`
        TypeError: the 3' end of ``self`` and the 5' end of ``b`` differ
            in type or length, or their sequences are not reverse
            complements of each other
    '''
    if not isinstance(b, VirtualHelix):
        # The message used to name DSeq, which is not the type checked here.
        raise ValueError("{} object not a VirtualHelix".format(b))

    type3, seq3 = self.three_prime_end()
    type5, seq5 = b.five_prime_end()
    if type3 != type5 or len(seq3) != len(seq5):
        raise TypeError("Ends not compatible")
    if seq3 != reverseComplement(seq5):
        raise TypeError("Ends not complimentary")

    fwd = self.fwd + b.fwd
    # NOTE(review): if ``rev`` is stored 5'->3' the joined reverse strand
    # would be ``b.rev + self.rev`` -- confirm against VirtualHelix's
    # strand representation before changing.
    rev = self.rev + b.rev
    return VirtualHelix(fwd, rev, self.overhang)
def __add__(self, b: 'DSeq') -> 'DSeq':
    '''Join ``b`` onto the 3' end of this :class:`DSeq`.

    The forward strands are concatenated left to right.  The reverse
    strand is stored 5'->3' (its reverse complement is what lines up with
    the forward strand), so the reverse strand of the product starts with
    the reverse strand of ``b`` and ends with the reverse strand of
    ``self``.

    Args:
        b: a :class:`DSeq` object

    Returns:
        a :class:`DSeq` object with the overhang and alphabet of ``self``

    Raises:
        ValueError: ``b`` is not a :class:`DSeq`
        TypeError: either operand is circular, the 3' end of ``self`` and
            the 5' end of ``b`` differ in type or length, or their
            sequences are not reverse complements of each other
    '''
    if not isinstance(b, DSeq):
        raise ValueError("{} object not a DSeq".format(b))
    if self.is_circular or b.is_circular:
        err: str = "Can't concatenate circular DSeq: {} + {}"
        raise TypeError(err.format(self, b))

    type3, seq3 = self.three_prime_end()
    type5, seq5 = b.five_prime_end()
    if type3 != type5 or len(seq3) != len(seq5):
        raise TypeError("Ends not compatible")
    if seq3 != reverseComplement(seq5):
        raise TypeError("Ends not complimentary")

    fwd = self.fwd + b.fwd
    # rc(A + B) == rc(B) + rc(A): joining the reverse strands as
    # self.rev + b.rev would yield a strand that no longer pairs with fwd.
    rev = b.rev + self.rev
    # Carry the alphabet over so an RNA duplex does not fall back to the
    # constructor default.
    return DSeq(fwd, rev, self.overhang, alphabet=self.alphabet)
def test_revCompProfile(self):
    '''Time the pure-Python and ``seqstr`` reverse complement and print both.

    Makes no assertions; the timings are only reported on stdout.
    '''
    probe_seq = "ACGTUMRWSYKVHDBNACGTUMRWSYKVHDBN"
    repeats = 10000

    def run_native():
        return reverseComplement(probe_seq)

    def run_capi():
        return seqstr.reverseComplement(probe_seq)

    py_time = timeit.timeit(run_native, number=repeats)
    capi_time = timeit.timeit(run_capi, number=repeats)
    print('\nNative python rev_comp (time for 10000X):', py_time)
    print('C API python rev_comp (time for 10000X): ', capi_time)
def test_revCompProfile(self):
    '''Time the pure-Python and ``seqstr`` reverse complement and print both.

    Makes no assertions; the timings are only reported on stdout.
    '''
    probe_seq = "ACGTUMRWSYKVHDBNACGTUMRWSYKVHDBN"
    repeats = 10000

    def run_native():
        return reverseComplement(probe_seq)

    def run_capi():
        return seqstr.reverseComplement(probe_seq)

    py_time = timeit.timeit(run_native, number=repeats)
    capi_time = timeit.timeit(run_capi, number=repeats)
    print('\nNative python rev_comp (time for 10000X):', py_time)
    print('C API python rev_comp (time for 10000X): ', capi_time)
def isCircularizable(self) -> bool:
    '''Report whether the two ends of this molecule could be joined.

    Returns:
        False when the molecule is already circular.  Otherwise the
        result of requiring that the 3' and 5' ends have the same type
        and that the 3' end sequence equals the reverse complement of the
        5' end sequence.
    '''
    if self.is_circular:
        return False
    type3, seq3 = self.three_prime_end()
    type5, seq5 = self.five_prime_end()
    same_end_type = type3 == type5
    # ``and`` short-circuits, so sequences are only compared when the end
    # types agree.
    return same_end_type and seq3 == reverseComplement(seq5)
def checkConstraintFailedCount(candidate, idxs, seq_set, hd=5):
    '''Count Hamming-distance violations of ``candidate`` against a subset.

    Each selected sequence is checked twice: as given and as its reverse
    complement.  Every check whose Hamming distance to ``candidate`` is
    below ``hd`` adds one to the count, so a single sequence can
    contribute up to two failures.

    Args:
        candidate: sequence being evaluated
        idxs: indices into ``seq_set`` selecting the sequences to check
        seq_set: indexable collection of sequences
        hd: minimum acceptable Hamming distance; defaults to 5, the value
            that was previously hard-coded

    Returns:
        the number of failed checks (0 when ``idxs`` is empty)
    '''
    failed = 0
    for idx in idxs:
        seq = seq_set[idx]
        if ss.hammingDistance(candidate, seq) < hd:
            failed += 1
        if ss.hammingDistance(candidate, ss.reverseComplement(seq)) < hd:
            failed += 1
    return failed
def checkConstraints(candidate, seq_set, hd=5):
    '''Check ``candidate`` against every sequence in ``seq_set``.

    A sequence rejects the candidate when the Hamming distance between
    the candidate and either the sequence or its reverse complement is
    below ``hd``.  Evaluation stops at the first rejection.

    A GC-content check existed here at some point but is disabled; only
    the Hamming-distance constraint is enforced.

    Args:
        candidate: sequence being evaluated
        seq_set: iterable of sequences to compare against
        hd: minimum acceptable Hamming distance

    Returns:
        True if no sequence rejects the candidate, otherwise False
    '''
    return not any(
        ss.hammingDistance(candidate, seq) < hd
        or ss.hammingDistance(candidate, ss.reverseComplement(seq)) < hd
        for seq in seq_set
    )
def filterRegionSequence(
        query_seq: str,
        query_strand: int,
        transcript_id: str,
        transcript: dict = None,
        reference_seq: str = None) -> Tuple[str, bool]:
    '''Orient a query sequence to its transcript and confirm it is present.

    When the query strand differs from the transcript's ``'strand'``
    entry, the query is reverse complemented before being searched for in
    the reference sequence.

    NOTE: Sometimes there are errors in probes so be sure to validate all
    sequence lookups!!!

    Args:
        query_seq: sequence to look for
        query_strand: strand of the query, compared to
            ``transcript['strand']``
        transcript_id: identifier used for lookups and in the error message
        transcript: Default is None.  If provided, the ``lookUpID`` call
            is omitted
        reference_seq: Default is None.  If provided, the ``getSequence``
            call is omitted

    Returns:
        Tuple of the form ``(sequence, was_rc)``: the query as it occurs
        in the reference sequence, and whether it had to be reverse
        complemented to get there

    Raises:
        ValueError: the (possibly reverse complemented) query is not a
            substring of the reference sequence
    '''
    if transcript is None:
        transcript = lookUpID(transcript_id)
    if reference_seq is None:
        reference_seq = getSequence(transcript_id)

    if transcript['strand'] != query_strand:
        oriented_seq: str = reverseComplement(query_seq)
        was_rc: bool = True
    else:
        oriented_seq: str = query_seq
        was_rc: bool = False

    if oriented_seq in reference_seq:
        return oriented_seq, was_rc

    err: str = "Region sequence not in transcript_id: %s: %d, rc: %s"
    raise ValueError(err % (transcript_id, query_strand, was_rc))
def DSeqVH(fwd: str,
           rev: str = None,
           overhang: int = None,
           alphabet: int = AlphaEnum.DNA) -> VirtualHelix:
    '''Build a single-oligo-per-strand :class:`VirtualHelix` from strings,
    in the style of the :class:`DSeq` constructor.

    A temporary :class:`DSeq` is created only to obtain the overhang.  A
    positive overhang becomes the forward index offset; a zero or
    negative overhang becomes the reverse index offset (stored as-is).

    Args:
        fwd: forward strand sequence
        rev: reverse strand sequence; when None the reverse complement of
            ``fwd`` is used
        overhang: forwarded to :class:`DSeq`
        alphabet: forwarded to :class:`DSeq`

    Returns:
        a :class:`VirtualHelix` holding the 5' strand of one forward and
        one reverse :class:`Oligo`
    '''
    resolved_overhang: int = DSeq(fwd, rev, overhang, alphabet).overhang
    if resolved_overhang > 0:
        fwd_idx_offsets, rev_idx_offsets = [resolved_overhang], [0]
    else:
        fwd_idx_offsets, rev_idx_offsets = [0], [resolved_overhang]

    oligo_fwd = Oligo(fwd)
    oligo_rev = Oligo(reverseComplement(fwd) if rev is None else rev)
    return VirtualHelix(
        [oligo_fwd.strand5p],
        fwd_idx_offsets,
        [oligo_rev.strand5p],
        rev_idx_offsets,
    )
def DSeqVH(
        fwd: str,
        rev: str = None,
        overhang: int = None,
        alphabet: int = AlphaEnum.DNA) -> VirtualHelix:
    '''Build a single-oligo-per-strand :class:`VirtualHelix` from strings,
    in the style of the :class:`DSeq` constructor.

    A temporary :class:`DSeq` is created only to obtain the overhang.  A
    positive overhang becomes the forward index offset; a zero or
    negative overhang becomes the reverse index offset (stored as-is).

    Args:
        fwd: forward strand sequence
        rev: reverse strand sequence; when None the reverse complement of
            ``fwd`` is used
        overhang: forwarded to :class:`DSeq`
        alphabet: forwarded to :class:`DSeq`

    Returns:
        a :class:`VirtualHelix` holding the 5' strand of one forward and
        one reverse :class:`Oligo`
    '''
    resolved_overhang: int = DSeq(fwd, rev, overhang, alphabet).overhang
    if resolved_overhang > 0:
        fwd_idx_offsets, rev_idx_offsets = [resolved_overhang], [0]
    else:
        fwd_idx_offsets, rev_idx_offsets = [0], [resolved_overhang]

    oligo_fwd = Oligo(fwd)
    oligo_rev = Oligo(reverseComplement(fwd) if rev is None else rev)
    return VirtualHelix(
        [oligo_fwd.strand5p],
        fwd_idx_offsets,
        [oligo_rev.strand5p],
        rev_idx_offsets,
    )
def __init__(self,
        fwd: str,
        rev: str = None,
        overhang: int = None,
        is_circular: bool = False,
        alphabet: int = AlphaEnum.DNA):
    '''Build a double-stranded sequence from a forward and a reverse strand.

    Sets ``alphabet``, ``fwd``, ``rev``, ``rc_rev``, ``overhang``,
    ``alignment``, ``the_length`` and ``is_circular`` on the instance.

    Args:
        fwd: ``fwd`` maps to reference in ``ssw-py``
        rev: ``rev`` maps to read in ``ssw-py``.  When None, the reverse
            complement of ``fwd`` is used.
        overhang: Use to force an alignment.  When None, the alignment and
            overhang are derived from ``align_complement(fwd, rev)``.
            When given, no alignment is computed; an ``Alignment`` record
            is synthesised instead:
            a positive value starts it at reference (``fwd``) index 0 and
            read index ``overhang``; a negative value starts it at
            reference index ``-overhang`` and read index 0.
            (Presumably + shifts the 5' end of the fwd strand in the 3'
            direction and - shifts it in the 5' direction -- TODO confirm
            the strand-direction wording.)
        is_circular: whether the molecule is circular
        alphabet: whether this is DNA or RNA; must be in ``ALPHABETS``

    Raises:
        ValueError: ``overhang`` is given without ``rev``, or
            ``is_circular`` is set while neither end type is
            ``PrimeEnum.BLUNT``
        AssertionError: ``alphabet`` is not in ``ALPHABETS``
    '''
    # NOTE(review): assert statements are removed under ``python -O``, so
    # this validation disappears in optimised runs.
    assert(alphabet in ALPHABETS)
    self.alphabet: int = alphabet
    self.fwd: str = fwd
    self.overhang: int = 0
    if rev is None:
        if overhang is not None:
            raise ValueError("overhang can't be defined for without a reverse strand")
        else:
            # No reverse strand supplied: use the perfect complement.
            self.rev: str = reverseComplement(fwd)
    else:
        self.rev: str = rev

    max_idx_fwd: int = len(fwd) - 1
    max_idx_rev: int = len(self.rev) - 1
    the_length: int = max(max_idx_fwd, max_idx_rev) + 1  # default
    self.alignment: Alignment
    self.rc_rev: str
    if overhang is None:
        # Derive the overhang from the computed alignment.
        alignment, self.rc_rev = align_complement(fwd, self.rev)
        if max_idx_fwd > alignment.reference_end:  # positive overhang
            # fwd continues past the last aligned reference position
            self.overhang = max_idx_fwd - alignment.reference_end
            the_length: int = max_idx_fwd + alignment.read_start + 1
        elif max_idx_rev > alignment.read_end:  # negative overhang
            # the read continues past its last aligned position
            self.overhang = alignment.read_end - max_idx_fwd
            the_length: int = max_idx_rev + alignment.reference_start + 1
        self.alignment = alignment
    else:
        # Caller forces the overhang: synthesise the alignment coordinates
        # instead of running an aligner.
        self.overhang = overhang
        if overhang < 0:
            reference_start: int = -overhang
            read_start: int = 0
            # delta is the index span of the paired region
            delta = min(max_idx_fwd + overhang, max_idx_rev)
            reference_end: int = reference_start + delta
            read_end: int = delta
            the_length: int = max_idx_rev + reference_start + 1
        elif overhang > 0:
            reference_start: int = 0
            read_start: int = overhang
            delta = min(max_idx_fwd, max_idx_rev - read_start)
            reference_end: int = delta
            read_end: int = read_start + delta
            the_length: int = max_idx_fwd + read_start + 1
        else:
            # overhang == 0: both strands start at index 0
            reference_start: int = 0
            read_start: int = 0
            delta: int = min(max_idx_fwd, max_idx_rev)
            reference_end: int = delta
            read_end: int = delta
            the_length: int = max(max_idx_fwd, max_idx_rev) + 1
        self.alignment = Alignment(
            '',  # null
            0,  # null
            0,  # null
            reference_start,
            reference_end,
            read_start,
            read_end
        )
        # rev cannot be None here: overhang without rev raised above.
        self.rc_rev = reverseComplement(rev)
    self.the_length: int = the_length
    if is_circular:
        type3, seq3 = self.three_prime_end()
        type5, seq5 = self.five_prime_end()
        # Rejected only when BOTH ends are non-blunt; seq3/seq5 are unused.
        if (
            (type3 != PrimeEnum.BLUNT) and
            (type5 != PrimeEnum.BLUNT)
        ):
            raise ValueError("DNA is_circular but ends can't mate")
    self.is_circular = is_circular
def test_revComp(self):
    '''Check that the pure-Python and ``seqstr`` reverse complements agree
    on 1000 sequences produced by ``self._randSeq()``.
    '''
    for _ in range(1000):
        seq = self._randSeq()
        self.assertEqual(
            reverseComplement(seq),
            seqstr.reverseComplement(seq),
        )
def align_complement(fwd: str, rev: str) -> Tuple[Alignment, str]:
    '''Align the reverse complement of ``rev`` against ``fwd``.

    Args:
        fwd: forward strand sequence
        rev: reverse strand sequence

    Returns:
        Tuple of the alignment returned by ``force_align`` and the
        reverse complement of ``rev`` that was aligned
    '''
    rc_rev: str = reverseComplement(rev)
    return force_align(rc_rev, fwd), rc_rev
def string_align_complement(
        fwd: str,
        rev: str,
        alignment: Alignment = None,
        rc_rev: str = None,
        do_print: bool = False,
        do_highlight: bool = False) -> Tuple[str, str]:
    '''Render a duplex as two space-padded text lines.

    The first line is ``fwd``; the second is ``reverse(rev)``.  Whichever
    strand starts its alignment at the smaller index is left-padded with
    spaces so the aligned start positions share a column.

    Args:
        fwd: forward strand sequence
        rev: reverse strand sequence
        alignment: precomputed alignment; when None it is computed with
            ``align_complement`` (which also supplies ``rc_rev``)
        rc_rev: reverse complement of ``rev``; computed when None
        do_print: print both lines to stdout
        do_highlight: lower-case the reverse-strand bases that do not
            pair with the forward strand inside the overlapping region
            and pass that region through ``highlight``

    Returns:
        Tuple of the forward line and the reverse line
    '''
    if alignment is None:
        alignment, rc_rev = align_complement(fwd, rev)
    if rc_rev is None:
        rc_rev = reverseComplement(rev)
    flipped_rev: str = reverse(rev)

    fwd_start: int = alignment.reference_start
    rev_start: int = alignment.read_start

    # Only one of these is non-empty: multiplying a string by a
    # non-positive count yields ''.
    pad_fwd: str = ' ' * (rev_start - fwd_start)
    pad_rev: str = ' ' * (fwd_start - rev_start)

    if do_highlight:
        # Overlap = shared bases before the alignment start plus the
        # shorter of the two remainders after it.
        lead: int = min(fwd_start, rev_start)
        tail: int = min(len(fwd) - fwd_start, len(rev) - rev_start)
        span: int = lead + tail
        fwd_lo: int = fwd_start - lead
        rev_lo: int = rev_start - lead

        marked: List[str] = []
        for offset in range(span):
            rev_pos = rev_lo + offset
            base = flipped_rev[rev_pos]
            paired = rc_rev[rev_pos] == fwd[fwd_lo + offset]
            marked.append(base if paired else base.lower())

        coloured: str = highlight(
            ''.join(marked),
            DNALex(),
            TermFormatter(style=MisMatchStyle))
        out_rev: str = ''.join((
            pad_rev,
            flipped_rev[:rev_lo],
            coloured.strip().translate(TRANTAB),
            flipped_rev[rev_lo + span:],
        ))
    else:
        out_rev: str = pad_rev + flipped_rev
    out_fwd: str = pad_fwd + fwd

    if do_print:
        print(out_fwd)
        print(out_rev)
    return out_fwd, out_rev
def checkOffTarget(primer_str, genome_str, primer_idx, params,
                   hamming_percentile=0.05, genome_rc_str=None):
    """Return the Tm of the strongest off-target hybridization of a primer.

    The 5' index of the primer on the fwd strand must be provided for
    masking purposes. The reverse complement of the genome,
    ``genome_rc_str``, may be provided as a performance optimization.

    Under the hood, the rolling Hamming distance between the primer and
    the genome, and between the primer and the genome's reverse
    complement, is computed. As a means of optimization, only positions
    whose Hamming distance is strictly below the ``hamming_percentile``
    percentile of all distances are then screened with
    ``primer3.calcHeterodimerTm``. The primer's own footprint is masked
    out before that selection. The two strands are processed in separate
    processes.

    Args:
        primer_str (str)            : primer sequence string
        genome_str (str)            : genome sequence string
        primer_idx (int)            : 5'-most index of a primer sequence on
                                      the forward strand, used to mask the
                                      binding region
        params (dict)               : parameters dictionary; the
                                      ``'thermo_params'`` entry is passed
                                      as keyword arguments to
                                      ``primer3.calcHeterodimerTm``
        hamming_percentile (float, optional) : Hamming distance percentile
                                      (0-100) below which positions are
                                      screened thermodynamically
        genome_rc_str (str, optional) : reverse complement of the genome
                                      (optimization to minimize the number
                                      of times this operation must be
                                      performed / number of memory copies)

    Returns:
        The highest Tm (deg. C) found on either strand. The index and
        strand of that hit are computed internally but are not returned.
        A strand with no screened positions contributes -100.

    Raises:
        None
    """
    genome_rc_str = genome_rc_str or seqstr.reverseComplement(genome_str)
    primer_length = len(primer_str)
    strand_results = mp.Queue()

    # NOTE(review): nested functions as Process targets presumably rely on
    # the fork start method -- confirm on platforms that default to spawn.
    def _fwdStrand():
        fwd_hamming_distances = seqstr.rollingHammingDistance(
            primer_str, genome_rc_str)
        fwd_hd_thresh = np.percentile(fwd_hamming_distances,
                                      hamming_percentile)
        # Mask the primer's own footprint, addressed from the end of the
        # array. ``-0`` is just 0, which would make the slice empty and
        # skip the mask when primer_idx == 0, hence ``or None``.
        mask_start = -(primer_idx + primer_length)
        mask_stop = -primer_idx or None
        fwd_hamming_distances[mask_start:mask_stop] = primer_length
        fwd_hotspots, = np.where(fwd_hamming_distances < fwd_hd_thresh)
        highest_tm_idx = None
        highest_tm = -100
        for idx in fwd_hotspots:
            # Same ``-0`` pitfall: idx == 0 must select the last
            # primer_length bases, not an empty string.
            target = genome_str[-(idx + primer_length):(-idx or None)]
            tm = primer3.calcHeterodimerTm(
                primer_str, target, **params['thermo_params'])
            if tm > highest_tm:
                highest_tm_idx = idx
                highest_tm = tm
        strand_results.put((highest_tm, highest_tm_idx, 1))

    def _revStrand():
        rev_hamming_distances = seqstr.rollingHammingDistance(
            primer_str, genome_str)
        rev_hd_thresh = np.percentile(rev_hamming_distances,
                                      hamming_percentile)
        # Mask the primer's own footprint.
        mask_start = primer_idx
        mask_stop = primer_idx + primer_length
        rev_hamming_distances[mask_start:mask_stop] = primer_length
        rev_hotspots, = np.where(rev_hamming_distances < rev_hd_thresh)
        highest_tm_idx = None
        highest_tm = -100
        for idx in rev_hotspots:
            tm = primer3.calcHeterodimerTm(
                primer_str, genome_rc_str[idx:idx + primer_length],
                **params['thermo_params'])
            if tm > highest_tm:
                highest_tm_idx = idx
                highest_tm = tm
        strand_results.put((highest_tm, highest_tm_idx, 0))

    fwd_proc = mp.Process(target=_fwdStrand)
    rev_proc = mp.Process(target=_revStrand)
    fwd_proc.start()
    rev_proc.start()
    # Drain the queue before joining: a process that has put items on a
    # queue may not terminate until they have been consumed.
    res1 = strand_results.get()
    res2 = strand_results.get()
    fwd_proc.join()
    rev_proc.join()
    return max(res1[0], res2[0])
def __init__(self,
             fwd: str,
             rev: str = None,
             overhang: int = None,
             is_circular: bool = False,
             alphabet: int = AlphaEnum.DNA):
    '''Build a double-stranded sequence from a forward and a reverse strand.

    Sets ``alphabet``, ``fwd``, ``rev``, ``rc_rev``, ``overhang``,
    ``alignment``, ``the_length`` and ``is_circular`` on the instance.

    Args:
        fwd: ``fwd`` maps to reference in ``ssw-py``
        rev: ``rev`` maps to read in ``ssw-py``.  When None, the reverse
            complement of ``fwd`` is used.
        overhang: Use to force an alignment.  When None, the alignment and
            overhang are derived from ``align_complement(fwd, rev)``.
            When given, no alignment is computed; an ``Alignment`` record
            is synthesised instead:
            a positive value starts it at reference (``fwd``) index 0 and
            read index ``overhang``; a negative value starts it at
            reference index ``-overhang`` and read index 0.
            (Presumably + shifts the 5' end of the fwd strand in the 3'
            direction and - shifts it in the 5' direction -- TODO confirm
            the strand-direction wording.)
        is_circular: whether the molecule is circular
        alphabet: whether this is DNA or RNA; must be in ``ALPHABETS``

    Raises:
        ValueError: ``overhang`` is given without ``rev``, or
            ``is_circular`` is set while neither end type is
            ``PrimeEnum.BLUNT``
        AssertionError: ``alphabet`` is not in ``ALPHABETS``
    '''
    # NOTE(review): assert statements are removed under ``python -O``, so
    # this validation disappears in optimised runs.
    assert (alphabet in ALPHABETS)
    self.alphabet: int = alphabet
    self.fwd: str = fwd
    self.overhang: int = 0
    if rev is None:
        if overhang is not None:
            raise ValueError(
                "overhang can't be defined for without a reverse strand")
        else:
            # No reverse strand supplied: use the perfect complement.
            self.rev: str = reverseComplement(fwd)
    else:
        self.rev: str = rev

    max_idx_fwd: int = len(fwd) - 1
    max_idx_rev: int = len(self.rev) - 1
    the_length: int = max(max_idx_fwd, max_idx_rev) + 1  # default
    self.alignment: Alignment
    self.rc_rev: str
    if overhang is None:
        # Derive the overhang from the computed alignment.
        alignment, self.rc_rev = align_complement(fwd, self.rev)
        if max_idx_fwd > alignment.reference_end:  # positive overhang
            # fwd continues past the last aligned reference position
            self.overhang = max_idx_fwd - alignment.reference_end
            the_length: int = max_idx_fwd + alignment.read_start + 1
        elif max_idx_rev > alignment.read_end:  # negative overhang
            # the read continues past its last aligned position
            self.overhang = alignment.read_end - max_idx_fwd
            the_length: int = max_idx_rev + alignment.reference_start + 1
        self.alignment = alignment
    else:
        # Caller forces the overhang: synthesise the alignment coordinates
        # instead of running an aligner.
        self.overhang = overhang
        if overhang < 0:
            reference_start: int = -overhang
            read_start: int = 0
            # delta is the index span of the paired region
            delta = min(max_idx_fwd + overhang, max_idx_rev)
            reference_end: int = reference_start + delta
            read_end: int = delta
            the_length: int = max_idx_rev + reference_start + 1
        elif overhang > 0:
            reference_start: int = 0
            read_start: int = overhang
            delta = min(max_idx_fwd, max_idx_rev - read_start)
            reference_end: int = delta
            read_end: int = read_start + delta
            the_length: int = max_idx_fwd + read_start + 1
        else:
            # overhang == 0: both strands start at index 0
            reference_start: int = 0
            read_start: int = 0
            delta: int = min(max_idx_fwd,
                             max_idx_rev)
            reference_end: int = delta
            read_end: int = delta
            the_length: int = max(max_idx_fwd, max_idx_rev) + 1
        self.alignment = Alignment(
            '',  # null
            0,  # null
            0,  # null
            reference_start,
            reference_end,
            read_start,
            read_end)
        # rev cannot be None here: overhang without rev raised above.
        self.rc_rev = reverseComplement(rev)
    self.the_length: int = the_length
    if is_circular:
        type3, seq3 = self.three_prime_end()
        type5, seq5 = self.five_prime_end()
        # Rejected only when BOTH ends are non-blunt; seq3/seq5 are unused.
        if ((type3 != PrimeEnum.BLUNT) and (type5 != PrimeEnum.BLUNT)):
            raise ValueError("DNA is_circular but ends can't mate")
    self.is_circular = is_circular