def enzyme_selector(sequence, restriction_interval, genome_frequency=False, deterministic_overhangs=False, rb=False):
    """Select restriction enzymes that cut a sequence only inside an interval.

    Usage Example::

        from utils import extract_feature
        sequence, _ = extract_feature(sequence_id="AJ627603", data_dir="/home/chymera/data2/gt.ep/sequences/", feature_names=["Cre", "cre", "CRE"])
        outp = enzyme_selector(sequence=sequence, restriction_interval=[0,690], genome_frequency=[700,2000], deterministic_overhangs=True)
        print(outp)

    Args:
        sequence: Object exposing the sequence to analyse as ``sequence.seq``.
        restriction_interval: Two-item sequence ``[start, end]`` handed to
            ``Analysis.only_between``.
        genome_frequency: Falsy to skip frequency filtering, otherwise a
            two-item sequence ``[low, high]``; enzymes whose ``frequency()``
            lies outside that inclusive range are dropped.
        deterministic_overhangs: When true, drop enzymes whose overhang (as
            returned by ``utils.overhangs``) is empty or contains any of the
            ambiguity markers listed below.
        rb: Optional restriction batch to search with; ``AllEnzymes`` is used
            when this is falsy.

    Returns:
        dict: The enzymes that passed every requested filter, mapped to the
        values reported by ``Analysis.only_between``.
    """
    from Bio.Restriction import Analysis, AllEnzymes

    basic_analysis = Analysis(rb if rb else AllEnzymes, sequence.seq)
    selected = basic_analysis.only_between(restriction_interval[0], restriction_interval[1])

    # Filters build new dicts instead of deleting from the dict being iterated
    # (which raises RuntimeError on Python 3). They also run independently, so
    # omitting `genome_frequency` no longer leaves the result unbound.
    if genome_frequency:
        low, high = genome_frequency[0], genome_frequency[1]
        selected = {
            enzyme: cuts
            for enzyme, cuts in selected.items()
            if low <= enzyme.frequency() <= high
        }

    if deterministic_overhangs:
        from utils import overhangs

        ambiguous_markers = ("N", "R", "Y", "!!!", "S", "W", "M", "K", "B", "D", "H", "V")

        def has_defined_overhang(enzyme):
            # NOTE(review): assumes overhangs() returns a string - confirm in utils.
            overhang = overhangs(enzyme)
            return overhang != "" and not any(marker in overhang for marker in ambiguous_markers)

        selected = {
            enzyme: cuts
            for enzyme, cuts in selected.items()
            if has_defined_overhang(enzyme)
        }

    return selected
def remove_restriction_sites(dna_sequence, restrict_sites):
    """Mutate one codon at every detected restriction site of a sequence.

    Args:
        dna_sequence: Sequence object to scan; it must provide ``tomutable()``.
        restrict_sites: Restriction batch handed to ``Analysis`` as
            ``restrictionbatch``.

    Returns:
        The result of ``toseq()`` on the mutated copy of ``dna_sequence``.
    """
    logger.info("===== REMOVE RESTRICTION SITES =====")

    # check each unwanted restriction site
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=dna_sequence)
    result = analysis.full()

    # NOTE(review): tomutable()/toseq() are deprecated in recent Biopython
    # releases - confirm against the pinned version.
    mutable_seq = dna_sequence.tomutable()
    for enz, cuts in result.items():
        for cut in cuts:
            # Report the position being handled, not the enzyme's whole list.
            logger.info(
                "The restriction enzyme {0} can cut the sequence before position {1}!"
                .format(str(enz), cut))
            # map sequence position to codon position
            # subtract 1 from `cut` to convert from sequence to string indexing
            codon_pos, offset = divmod((cut - 1) - (enz.size // 2), 3)

            # ensure the whole codon we mutate is being recognized by the restriction enzyme
            if offset:
                codon_pos += 1
            codon_idx = slice(codon_pos * 3, (codon_pos + 1) * 3)

            # NOTE(review): `codon_use_table` is not a parameter of this
            # function; it must be resolvable from module scope - confirm.
            new_codon = mutate_codon(mutable_seq[codon_idx], codon_use_table)
            mutable_seq[codon_idx] = new_codon

    return mutable_seq.toseq()
def _cap_enzymes_between_alleles(allele1, allele2, reference, location,
                                 all_enzymes=False):
    '''Return the enzymes that have sites in exactly one of the two alleles.

    Both alleles are dicts with the keys 'kind' and 'allele'. A sequence is
    built for each one with create_alleles, each sequence is analysed as a
    linear molecule, and the symmetric difference of the two sets of enzymes
    having sites is returned as a set.

    With all_enzymes set, CommOnly is searched; otherwise the search is
    restricted to a RestrictionBatch built from COMMON_ENZYMES.
    '''
    first_kind = allele1['kind']
    second_kind = allele2['kind']
    first_allele = allele1['allele']
    second_allele = allele2['allele']

    batch = CommOnly if all_enzymes else RestrictionBatch(COMMON_ENZYMES)

    # Build both sequences first, then analyse them one after the other.
    first_seq = create_alleles('seq1', first_allele, first_kind, reference,
                               location)
    second_seq = create_alleles('seq2', second_allele, second_kind, reference,
                                location)

    first_cutters = set(
        Analysis(batch, first_seq, linear=True).with_sites().keys())
    second_cutters = set(
        Analysis(batch, second_seq, linear=True).with_sites().keys())

    return first_cutters ^ second_cutters
def test_print_that(self):
    """Test print_that function.

    Standard output is swapped for an in-memory buffer while ``print_that``
    runs, and the stream that was installed before the test is restored
    afterwards even if the call raises.
    """
    out = self.StringIO()
    previous_stdout = self.sys.stdout
    self.sys.stdout = out
    try:
        my_batch = EcoRI + SmaI + KpnI
        my_seq = Seq("GAATTCCCGGGATATA")  # EcoRI and SmaI sites
        analysis = Analysis(my_batch, my_seq)
        analysis.print_that(None, title="My sequence\n\n", s1="Non Cutters\n\n")
    finally:
        # Restore the previous stream rather than sys.__stdout__, so a test
        # runner that captures output keeps doing so for later tests.
        self.sys.stdout = previous_stdout

    printed = out.getvalue()
    self.assertIn("My sequence", printed)
    self.assertIn("Non Cutters", printed)
    self.assertIn("2.", printed)
def test_print_that(self):
    """Test print_that function.

    The output of ``print_that`` is captured in a ``StringIO`` buffer. The
    original stdout object is put back in a ``finally`` clause so that a
    failure cannot leave the interpreter printing into the buffer.
    """
    out = self.StringIO()
    saved_stdout = self.sys.stdout
    self.sys.stdout = out
    try:
        my_batch = EcoRI + SmaI + KpnI
        my_seq = Seq('GAATTCCCGGGATATA')  # EcoRI and SmaI sites
        analysis = Analysis(my_batch, my_seq)
        analysis.print_that(None, title='My sequence\n\n', s1='Non Cutters\n\n')
    finally:
        # Put back whatever was installed before (possibly a runner's capture
        # stream) instead of unconditionally resetting to sys.__stdout__.
        self.sys.stdout = saved_stdout

    captured = out.getvalue()
    self.assertIn('My sequence', captured)
    self.assertIn('Non Cutters', captured)
    self.assertIn('2.', captured)
def eval_restriction_sites(individual, restrict_sites):
    """Score an individual by the number of restriction sites in its sequence.

    TODO: Make it remove rest sites

    Args:
        individual: A ``SequenceContainer`` instance; its ``sequence``
            attribute is analysed.
        restrict_sites: Restriction batch handed to ``Analysis`` as
            ``restrictionbatch``.

    Returns:
        int: Total number of cut positions reported by ``Analysis.full()``
        across all enzymes.
    """
    # The former `individual is SequenceContainer` compared the object with
    # the class itself, so it could never hold for an instance.
    assert isinstance(individual, SequenceContainer), \
        "individual must be a SequenceContainer instance"
    sequence = individual.sequence

    # check unwanted restriction sites
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=sequence)
    result = analysis.full()

    # score the sequence based on the number of restriction sites
    return sum(len(cuts) for cuts in result.values())
def test_change(self):
    """Check that change() swaps the analysed sequence and enzyme batch."""
    pieces = [
        'CCAGTCTATAATTCG', BamHI.site,
        'GCGGCATCATACTCGA', BamHI.site,
        'ATATCGCGTGATGATA', EcoRV.site,
        'CGTAGTAATTACGCATG',
    ]
    seq = Seq(''.join(pieces))

    # Initial state: BamHI is part of the batch and cuts twice.
    analysis = Analysis(NdeI + EcoRI + BamHI + BsmBI, seq)
    self.assertEqual(analysis.full()[BamHI], [17, 39])

    # Drop BamHI from the batch and append an NdeI site to the sequence.
    reduced_batch = NdeI + EcoRI + BsmBI
    seq += NdeI.site
    analysis.change(sequence=seq)
    analysis.change(rb=reduced_batch)
    self.assertEqual(len(analysis.full()), 3)
    self.assertEqual(analysis.full()[NdeI], [85])

    # 'KonsoleWidth' is misspelled on purpose (Console) and must be rejected.
    with self.assertRaises(AttributeError):
        analysis.change(NameWidth=3, KonsoleWidth=40)
def remove_restriction_sites(dna_sequence, codon_use_table, restrict_sites):
    """Identify and remove sequences recognized by a set of restriction enzymes.

    Args:
        dna_sequence (Bio.Seq.Seq): A read-only representation of
            the DNA sequence.
        codon_use_table (dict{str, list[list, list]}): A dictionary with
            each amino acid three-letter code as keys, and a list of two
            lists as values. The first list is the synonymous codons that
            encode the amino acid, the second is the frequency with which
            each synonymous codon is used.
        restrict_sites (Bio.Restriction.RestrictionBatch): RestrictionBatch
            instance configured with the input restriction enzymes.

    Returns:
        Bio.Seq.Seq: A read-only representation of the new DNA sequence.
    """
    logger.info("Removing restriction sites")

    # check each unwanted restriction site
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=dna_sequence)
    result = analysis.full()

    # NOTE(review): tomutable()/toseq() are deprecated in recent Biopython
    # releases - confirm against the pinned version.
    mutable_seq = dna_sequence.tomutable()
    for enz, cuts in result.items():
        for cut in cuts:
            # Report the position being handled, not the enzyme's whole list.
            logger.info(
                "Restriction enzyme ({}) cut site detected at position {}.".
                format(str(enz), cut))
            # map sequence position to codon position
            # subtract 1 from `cut` to convert from sequence to string indexing
            codon_pos, offset = divmod((cut - 1) - (enz.size // 2), 3)

            # ensure the whole codon we mutate is being recognized by the restriction enzyme
            if offset:
                codon_pos += 1
            codon_idx = slice(codon_pos * 3, (codon_pos + 1) * 3)

            new_codon = mutate_codon(mutable_seq[codon_idx], codon_use_table)
            mutable_seq[codon_idx] = new_codon

    return mutable_seq.toseq()
def test_analysis_restrictions(self):
    """Exercise the result-filtering helpers of Analysis."""
    check = self.assertEqual

    # Circular molecule: the sequence ends in GAA and starts with TTC, so the
    # EcoRI site (GAATTC) is only found because linear=False.
    circular_seq = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
                       IUPACAmbiguousDNA())
    batch = RestrictionBatch([EcoRI, KpnI, EcoRV])
    analysis = Analysis(batch, circular_seq, linear=False)

    # Enzymes cutting blunt.
    check(analysis.blunt(), {EcoRV: []})
    # Everything in the batch.
    check(analysis.full(), {KpnI: [], EcoRV: [], EcoRI: [33]})
    # Enzymes with at least one site.
    check(analysis.with_sites(), {EcoRI: [33]})
    # Enzymes without any site.
    check(analysis.without_site(), {KpnI: [], EcoRV: []})
    check(analysis.with_site_size([32]), {})
    # Enzymes leaving 5' overhangs.
    check(analysis.overhang5(), {EcoRI: [33]})
    # Enzymes leaving 3' overhangs.
    check(analysis.overhang3(), {KpnI: []})
    # Enzymes producing defined ends.
    check(analysis.defined(), {KpnI: [], EcoRV: [], EcoRI: [33]})
    # Enzymes which cut exactly N times.
    check(analysis.with_N_sites(2), {})

    # only_between() rejects non-numeric boundaries.
    for bad_start, bad_end in (('t', 20), (1, 't')):
        with self.assertRaises(TypeError):
            analysis.only_between(bad_start, bad_end)

    # Enzymes cutting only between two positions; the order of the two
    # boundaries does not matter.
    check(analysis.only_between(1, 20), {})
    check(analysis.only_between(20, 34), {EcoRI: [33]})
    check(analysis.only_between(34, 20), {EcoRI: [33]})
    check(analysis.only_outside(20, 34), {})

    # Unknown names only trigger a warning.
    with self.assertWarns(BiopythonWarning):
        analysis.with_name(['fake'])
    check(analysis.with_name([EcoRI]), {EcoRI: [33]})

    # _boundaries(): plain, reversed, negative start, negative end.
    boundary_cases = (
        ((1, 20), (1, 20)),
        ((20, 1), (1, 20)),
        ((-1, 20), (20, 33)),
        ((1, -1), (1, 33)),
    )
    for (start, end), expected in boundary_cases:
        check((analysis._boundaries(start, end)[:2]), expected)

    # Linear sequence with one EcoRI site inside and one outside of 1..7.
    two_site_seq = Seq('GAATTCAAAAAAGAATTC', IUPACAmbiguousDNA())
    ecori_only = RestrictionBatch([EcoRI])
    analysis = Analysis(ecori_only, two_site_seq)
    # Cuts at least once inside.
    check(analysis.between(1, 7), {EcoRI: [2, 14]})
    # Cuts at least once inside, reporting only the inside site.
    check(analysis.show_only_between(1, 7), {EcoRI: [2]})
    # Cuts at least once outside.
    check(analysis.outside(1, 7), {EcoRI: [2, 14]})
    # Does not cut within the interval.
    check(analysis.do_not_cut(7, 12), {EcoRI: [2, 14]})
def findRestrictionSites(sequence, restr_batch):
    """Return the complete restriction analysis of a DNA string.

    `sequence` is wrapped in a Seq using the IUPACAmbiguousDNA alphabet and
    `restr_batch` is turned into a RestrictionBatch. The value returned is
    the result of Analysis.full() for that batch and sequence.
    """
    dna = Seq(sequence, IUPACAmbiguousDNA())
    enzyme_batch = RestrictionBatch(restr_batch)
    return Analysis(enzyme_batch, dna).full()
def get_restriction_table(seq, enzyme, circular=False):
    """
    Get the restriction table for a single genomic sequence.

    Parameters
    ----------
    seq : Seq object
        A biopython Seq object representing a chromosome or contig.
    enzyme : int, str or list of str
        The name of the restriction enzyme used, or a list of restriction
        enzyme names. Can also be an integer, to digest by fixed chunk size.
    circular : bool
        Whether the genome is circular.

    Returns
    -------
    numpy.array:
        List of restriction fragment boundary positions for the input
        sequence. With an integer `enzyme`, an empty sequence yields the
        single boundary 0.

    Raises
    ------
    ValueError
        If `enzyme` is neither accepted by RestrictionBatch nor convertible
        to an integer chunk size.

    >>> from Bio.Seq import Seq
    >>> get_restriction_table(Seq("AAGATCGATCGG"),"DpnII")
    array([ 0,  2,  6, 12])
    >>> get_restriction_table(Seq("AA"),["DpnII", "HinfI"])
    array([0, 2])
    >>> get_restriction_table(Seq("AA"),"aeiou1")
    Traceback (most recent call last):
        ...
    ValueError: aeiou1 is not a valid restriction enzyme.
    >>> get_restriction_table("AA","DpnII")
    Traceback (most recent call last):
        ...
    TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead
    """
    chrom_len = len(seq)
    wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme)
    # Restriction batch containing the restriction enzyme
    try:
        enz = [enzyme] if isinstance(enzyme, str) else enzyme
        cutter = RestrictionBatch(enz)
    except (TypeError, ValueError):
        # Not a usable enzyme specification: fall back to a fixed chunk size.
        try:
            cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)
        except (TypeError, ValueError):
            # int() raises TypeError (not ValueError) for lists or None;
            # report both cases with the documented ValueError.
            raise ValueError(wrong_enzyme) from None

    if isinstance(cutter, int):
        # Fixed-size chunks; `or [0]` keeps an empty sequence from producing
        # an empty list, on which sites[-1] would raise IndexError.
        sites = list(range(0, chrom_len, cutter)) or [0]
        if sites[-1] < chrom_len:
            sites.append(chrom_len)
    else:
        # Find sites of all restriction enzymes given
        ana = Analysis(cutter, seq, linear=not circular)
        # Gets all sites into a single flat list with 0-based index
        sites = [
            site - 1 for positions in ana.full().values() for site in positions
        ]
        # Sort by position, then add the start and end of the sequence
        sites.sort()
        sites.insert(0, 0)
        sites.append(chrom_len)

    return np.array(sites)
def test_analysis_restrictions(self):
    """Check the result-filtering helpers of a circular Analysis."""
    sequence = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA', IUPACAmbiguousDNA())
    enzymes = RestrictionBatch([EcoRI, KpnI, EcoRV])
    analysis = Analysis(enzymes, sequence, linear=False)

    # Only the enzymes which cut blunt are reported.
    blunt_result = analysis.blunt()
    self.assertEqual(blunt_result, {EcoRV: []})

    full_result = analysis.full()
    self.assertEqual(full_result, {KpnI: [], EcoRV: [], EcoRI: [33]})

    # Only the enzymes which have a site are reported.
    cutters = analysis.with_sites()
    self.assertEqual(cutters, {EcoRI: [33]})

    # Only the enzymes which have no site are reported.
    non_cutters = analysis.without_site()
    self.assertEqual(non_cutters, {KpnI: [], EcoRV: []})

    self.assertEqual(analysis.with_site_size([32]), {})

    # Enzymes which cut only between two positions; swapping the boundaries
    # gives the same answer.
    self.assertEqual(analysis.only_between(1, 20), {})
    self.assertEqual(analysis.only_between(20, 34), {EcoRI: [33]})
    self.assertEqual(analysis.only_between(34, 20), {EcoRI: [33]})
    self.assertEqual(analysis.only_outside(20, 34), {})

    # An unknown enzyme name produces a warning.
    with self.assertWarns(BiopythonWarning):
        analysis.with_name(['fake'])
    self.assertEqual(analysis.with_name([EcoRI]), {EcoRI: [33]})

    # _boundaries(): given order, reversed order, negative start.
    self.assertEqual(analysis._boundaries(1, 20)[:2], (1, 20))
    self.assertEqual(analysis._boundaries(20, 1)[:2], (1, 20))
    self.assertEqual(analysis._boundaries(-1, 20)[:2], (20, 33))
def run_p(self):
    """Run the restriction analysis and show cut sites and per-enzyme counts.

    Reads the module-level ``userChoices`` and ``fastaRead`` and appends an
    end marker to the module-level ``enzymes`` list. The formatted analysis
    goes to ``restrictResults``; one line per enzyme with its number of
    matching sites goes to ``numeralResults``. Failures are logged through
    ``sh.log``.
    """
    global userChoices, enzymes, fastaRead
    sh.log("\nstart run_p")
    sh.click()
    self.restrictResults.clear()
    self.numeralResults.clear()

    if not fastaRead:
        self.restrictResults.setPlainText(
            "You must select a fasta file first")
        return
    if len(userChoices) <= 0:
        self.restrictResults.setPlainText(
            "You must select R.Enzymes first")
        return

    self.detectPushButton.setEnabled(False)  # can't run twice
    try:
        linear = self.linearCheckBox.isChecked()
        analysis = Analysis(userChoices, self.sequence, linear=linear)
    except Exception as err:
        # Nothing below can work without `analysis`: log, let the user try
        # again, and stop here instead of failing later with a NameError.
        sh.log("analysis failed " + repr(err))
        self.detectPushButton.setEnabled(True)
        return

    # print each enzyme with a list of its matching sites
    cutSites = str(
        analysis.format_output(
            dct=None,
            title='',
            s1='\n Enzymes which do not cut the sequence\n'))
    self.restrictResults.setPlainText(cutSites)

    # ------------------------------- FIND PALINDROME HIT COUNTS -----------------------------------------------
    endMarker = "END"
    try:
        enzymes.append(endMarker)
        # Extract enzymes and the index of their cutSites from cutSites:
        # keep the text before the first "Enzymes" and split it into tokens
        # after dropping the '.' and ':' punctuation.
        palin = cutSites[:cutSites.find("Enzymes")].replace(
            '.', "").replace(':', "").split()
        palin.append(endMarker)
        sh.log("palin: " + str(palin))
    except Exception as err:
        sh.log("palin NG " + repr(err))
        palin = None

    if palin is not None:
        try:
            # Calculate and display the number of matching sites for each enzyme
            # enzPosn initially has a list of lists. Each sublist has the enzyme
            # name and the index of the enzyme in palin.
            # enzPosn sublist later has the enzyme name and the number of matches.
            enzPosn = []
            enzNone = []
            sh.log("len palin " + str(len(palin)))
            sh.log("user choices " + str(userChoices))
            # Work on a copy so the end marker does not leak into the global
            # userChoices list.
            allChoices = list(userChoices)
            allChoices.append(endMarker)  # matches last name in palin
            sh.log("allChoices " + str(allChoices))
            for enz in allChoices:
                if enz in palin:
                    enzPosn.append([enz, palin.index(enz)])
                else:
                    sh.log(str(enz) + " not in palin")
                    enzNone.append(enz)
            sh.log("enzPosn = " + str(enzPosn))

            enzPosn.sort(key=lambda x: x[1])  # sort on index of name in palin
            # Replace each index with the number of tokens between this name
            # and the next one, i.e. the length of the palin entry.
            for current, following in zip(enzPosn, enzPosn[1:]):
                current[1] = following[1] - current[1] - 1
            del enzPosn[-1]  # delete endMarker

            for enz in enzNone:
                enzPosn.append([enz, 0])  # add in enzymes not found; length = 0
            enzPosn.sort(key=lambda x: x[0])  # sort on name
            sh.log("enzPosn = " + str(enzPosn))

            # show the number of matches for each enzyme
            for name, hits in enzPosn:
                matchStr = "{0:7,d} : {1:s}\n\n".format(hits, name)
                self.numeralResults.insertPlainText(matchStr)
        except Exception as err:
            sh.log('I cannot do that. ' + repr(err))

    self.detectPushButton.setEnabled(False)
    self.nPosPushButton.setEnabled(True)