def _remove_rec_sites(seq, enzymes=None):
    '''Replace every recognition site found in seq with a modified version.

    The recognition sites to look for are obtained from get_ret_sites for
    the given enzymes (MANDATORY_DOMEST_ENZYMES when enzymes is None) and
    are searched for case-insensitively.

    Returns a tuple (new_seq, rec_site_pairs, fragments):
        new_seq: the rebuilt sequence, with each site replaced by the value
            returned by _domesticate_rec_site.
        rec_site_pairs: one dict per replaced site, with the keys
            'original' and 'modified'.
        fragments: the pieces of the input sequence located between sites.

    Raises ValueError if the translation of the rebuilt sequence differs
    from the translation of the input one, or if a recognition site is
    still present in the rebuilt sequence.
    '''
    if enzymes is None:
        enzymes = MANDATORY_DOMEST_ENZYMES

    # A single alternation with all the sites. The capturing group makes
    # split() hand back the matched sites together with the text between
    # them.
    site_regex = re.compile('(%s)' % '|'.join(get_ret_sites(enzymes)),
                            flags=re.IGNORECASE)

    fragments = []
    found_sites = []
    for piece in site_regex.split(str(seq)):
        bucket = found_sites if site_regex.match(piece) else fragments
        bucket.append(piece)

    new_seq = Seq('', alphabet=generic_dna)
    # Everything consumed so far, in its original form. It is handed to
    # _domesticate_rec_site, which also receives the regex; the original
    # notes say it is only used to know the frame and that a site must not
    # be converted into another site.
    frame_context = ''
    rec_site_pairs = []
    for fragment, site in izip_longest(fragments, found_sites):
        new_seq += fragment
        if site is None:
            # fragment without a site after it: nothing left to replace
            continue
        frame_context += fragment + site
        domesticated = _domesticate_rec_site(site, frame_context, site_regex)
        rec_site_pairs.append({'original': site, 'modified': domesticated})
        new_seq += domesticated

    # Sanity checks on the result.
    # NOTE(review): _get_upper_nucls presumably keeps only the upper case
    # (coding) nucleotides -- confirm against its definition.
    old_coding = Seq(_get_upper_nucls(seq))
    new_coding = Seq(_get_upper_nucls(new_seq))
    same_peptide = str(old_coding.translate()) == str(new_coding.translate())
    if not same_peptide:
        raise ValueError('The generated sequence does not produce the same peptide')

    if site_regex.search(str(new_seq)):
        raise ValueError('Not all rec_sites modified')

    return new_seq, rec_site_pairs, fragments
def num_rec_sites(self):
    '''Return the number of recognition sites found in self.residues.

    The sites searched for are the ones get_ret_sites returns for
    ENZYMES_USED_IN_GOLDENBRAID. Matches are non-overlapping.
    '''
    # NOTE(review): the search is case sensitive here, whereas
    # _remove_rec_sites ignores case -- confirm this is intended.
    site_alternatives = '|'.join(get_ret_sites(ENZYMES_USED_IN_GOLDENBRAID))
    site_regex = re.compile('(' + site_alternatives + ')')
    return len(site_regex.findall(self.residues))