Ejemplo n.º 1
0
def enzyme_selector(sequence, restriction_interval, genome_frequency=False, deterministic_overhangs=False, rb=False):
    """
    Select restriction enzymes that cut `sequence` only inside an interval.

    The candidates are the result of `Analysis.only_between()` for the given
    interval. They can be narrowed further by the enzyme's cutting frequency
    and by whether the overhang it leaves is unambiguous.

    Args:
        sequence: Object exposing the DNA sequence as `sequence.seq`.
        restriction_interval: Two-item sequence `[start, end]` passed to
            `Analysis.only_between()`.
        genome_frequency: Falsy to skip frequency filtering, otherwise a
            two-item sequence `[minimum, maximum]`; enzymes whose
            `frequency()` lies outside this closed range are dropped.
        deterministic_overhangs: If true, drop enzymes whose overhang (as
            reported by `utils.overhangs`) is empty or contains one of the
            ambiguity markers listed below.
        rb: Falsy to test every enzyme in `AllEnzymes`, otherwise the
            restriction batch to test.

    Returns:
        dict: The selected enzymes mapped to the cut positions reported by
        the analysis.

    Usage Example:
        from utils import extract_feature
        sequence,_ = extract_feature(sequence_id="AJ627603", data_dir="/home/chymera/data2/gt.ep/sequences/", feature_names=["Cre", "cre", "CRE"])
        outp = enzyme_selector(sequence=sequence, restriction_interval=[0,690], genome_frequency=[700,2000], deterministic_overhangs=True)
        print(outp)
    """
    from Bio.Restriction import Analysis, AllEnzymes

    batch = rb if rb else AllEnzymes
    basic_analysis = Analysis(batch, sequence.seq)
    respect_target = basic_analysis.only_between(restriction_interval[0], restriction_interval[1])

    if deterministic_overhangs:
        # Project-local helper; imported lazily so it is only required when
        # the overhang filter is actually requested.
        from utils import overhangs

    # Markers that disqualify an overhang. Most are IUPAC ambiguity codes;
    # NOTE(review): "!!!" is presumably a sentinel emitted by
    # utils.overhangs -- confirm against that helper.
    ambiguous_markers = ["N", "R", "Y", "!!!", "S", "W", "M", "K", "B", "D", "H", "V"]

    # Build a fresh dict instead of deleting from the one being iterated:
    # deleting during iteration raises RuntimeError on Python 3. This also
    # guarantees a defined return value when no frequency range is given.
    selected = {}
    for enzyme, sites in respect_target.items():
        if genome_frequency:
            frequency = enzyme.frequency()
            if frequency < genome_frequency[0] or frequency > genome_frequency[1]:
                continue
        if deterministic_overhangs:
            overhang = overhangs(enzyme)
            if any(marker in overhang for marker in ambiguous_markers) or overhang == "":
                continue
        selected[enzyme] = sites

    return selected
Ejemplo n.º 2
0
def remove_restriction_sites(dna_sequence, restrict_sites):
    """Mutate one codon at every site cut by the given restriction enzymes.

    Every cut position reported by `Analysis.full()` is mapped to a codon
    index, and that codon is replaced with the result of `mutate_codon`.

    Note: `codon_use_table` is not a parameter of this function; it is
    looked up in the enclosing (module) scope.

    Args:
        dna_sequence: Sequence object providing `tomutable()`; it is passed
            to `Analysis` as the sequence to scan.
        restrict_sites: Restriction batch passed to `Analysis`.

    Returns:
        The mutated sequence, converted back with `toseq()`.
    """
    logger.info("===== REMOVE RESTRICTION SITES =====")

    # check each unwanted restriction site
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=dna_sequence)
    result = analysis.full()

    mutable_seq = dna_sequence.tomutable()
    for enz, cuts in result.items():
        for cut in cuts:
            # Report the single position being handled, not the whole list
            # of cut positions for this enzyme.
            logger.info(
                "The restriction enzyme {0} can cut the sequence before position {1}!"
                .format(str(enz), cut))
            # map sequence position to codon position
            # subtract 1 from `cut` to convert from sequence to string indexing
            codon_pos, offset = divmod((cut - 1) - (enz.size // 2), 3)

            # ensure the whole codon we mutate is being recognized by the restriction enzyme
            if offset:
                codon_pos += 1
            codon_idx = slice(codon_pos * 3, (codon_pos + 1) * 3)

            new_codon = mutate_codon(mutable_seq[codon_idx], codon_use_table)
            mutable_seq[codon_idx] = new_codon

    return mutable_seq.toseq()
Ejemplo n.º 3
0
def _cap_enzymes_between_alleles(allele1, allele2, reference, location,
                                 all_enzymes=False):
    '''Return the enzymes that have sites in exactly one of the two alleles.

    Each allele is a mapping with the keys 'kind' and 'allele'. A sequence
    is built for each of them with create_alleles, both are analysed as
    linear sequences, and the symmetric difference of the two sets of
    enzymes with sites is returned as a set.

    With all_enzymes the CommOnly batch is used, otherwise a batch built
    from COMMON_ENZYMES.
    '''
    kinds = (allele1['kind'], allele2['kind'])
    variants = (allele1['allele'], allele2['allele'])

    # pick the enzymes to test before building the sequences
    batch = CommOnly if all_enzymes else RestrictionBatch(COMMON_ENZYMES)

    sequences = [
        create_alleles(label, variant, kind, reference, location)
        for label, variant, kind in zip(('seq1', 'seq2'), variants, kinds)
    ]

    # one set of cutting enzymes per allele sequence
    cutters = [
        set(Analysis(batch, sequence, linear=True).with_sites().keys())
        for sequence in sequences
    ]

    return cutters[0].symmetric_difference(cutters[1])
Ejemplo n.º 4
0
 def test_print_that(self):
     """Test print_that function.

     Standard output is redirected into an in-memory buffer while
     print_that runs, and the captured text is then checked for the
     custom title, the custom non-cutter heading and the text "2.".
     """
     out = self.StringIO()
     # Remember the active stream so it is restored even when print_that
     # raises; otherwise later tests would keep writing into `out`.
     previous_stdout = self.sys.stdout
     self.sys.stdout = out
     try:
         my_batch = EcoRI + SmaI + KpnI
         my_seq = Seq("GAATTCCCGGGATATA")  # EcoRI and SmaI sites
         analysis = Analysis(my_batch, my_seq)
         analysis.print_that(None, title="My sequence\n\n", s1="Non Cutters\n\n")
     finally:
         self.sys.stdout = previous_stdout
     printed = out.getvalue()
     self.assertIn("My sequence", printed)
     self.assertIn("Non Cutters", printed)
     # NOTE(review): "2." presumably is a reported cut position -- confirm.
     self.assertIn("2.", printed)
Ejemplo n.º 5
0
 def test_print_that(self):
     """Test print_that function.

     Captures what print_that writes to standard output and checks that
     the custom title, the custom non-cutter heading and the text '2.'
     appear in it.
     """
     out = self.StringIO()
     # Save the current stream and restore it in `finally`, so a failure
     # inside print_that cannot leave stdout redirected for later tests.
     saved_stdout = self.sys.stdout
     self.sys.stdout = out
     try:
         my_batch = EcoRI + SmaI + KpnI
         my_seq = Seq('GAATTCCCGGGATATA')  # EcoRI and SmaI sites
         analysis = Analysis(my_batch, my_seq)
         analysis.print_that(None, title='My sequence\n\n',
                             s1='Non Cutters\n\n')
     finally:
         self.sys.stdout = saved_stdout
     captured = out.getvalue()
     self.assertIn('My sequence', captured)
     self.assertIn('Non Cutters', captured)
     # NOTE(review): '2.' presumably is a reported cut position -- confirm.
     self.assertIn('2.', captured)
Ejemplo n.º 6
0
 def test_print_that(self):
     """Test print_that function.

     The output of print_that is collected in a StringIO buffer and
     searched for the title, the non-cutter heading and the text '2.'.
     """
     out = self.StringIO()
     # The redirection is undone in `finally`: without it an exception in
     # print_that would leave every later test printing into this buffer.
     original_stream = self.sys.stdout
     self.sys.stdout = out
     try:
         my_batch = EcoRI + SmaI + KpnI
         my_seq = Seq('GAATTCCCGGGATATA')  # EcoRI and SmaI sites
         analysis = Analysis(my_batch, my_seq)
         analysis.print_that(None, title='My sequence\n\n',
                             s1='Non Cutters\n\n')
     finally:
         self.sys.stdout = original_stream
     text = out.getvalue()
     self.assertIn('My sequence', text)
     self.assertIn('Non Cutters', text)
     # NOTE(review): '2.' presumably is a reported cut position -- confirm.
     self.assertIn('2.', text)
Ejemplo n.º 7
0
def eval_restriction_sites(individual, restrict_sites):
    """Score an individual by the number of restriction sites it contains.

    Args:
        individual: A SequenceContainer instance; its `sequence` attribute
            is scanned.
        restrict_sites: Restriction batch passed to `Analysis`.

    Returns:
        int: Total number of cut positions reported by `Analysis.full()`,
        summed over all enzymes.

    Raises:
        AssertionError: If `individual` is not a SequenceContainer.

    TODO: Make it remove rest sites
    """
    # `is` compared the argument with the class object itself, which can
    # never hold for an instance; an instance check is what is meant here.
    assert isinstance(individual, SequenceContainer)
    sequence = individual.sequence
    # check unwanted restriction sites
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=sequence)
    result = analysis.full()
    # score the sequence based on the number of restriction sites
    return sum(len(cuts) for cuts in result.values())
Ejemplo n.º 8
0
 def test_change(self):
     """Verify that change() swaps in a new sequence and a new batch."""
     # Filler DNA carrying two BamHI sites followed by one EcoRV site.
     target = Seq('CCAGTCTATAATTCG' + BamHI.site +
                  'GCGGCATCATACTCGA' + BamHI.site +
                  'ATATCGCGTGATGATA' + EcoRV.site +
                  'CGTAGTAATTACGCATG')
     ana = Analysis(NdeI + EcoRI + BamHI + BsmBI, target)
     self.assertEqual(ana.full()[BamHI], [17, 39])

     # Drop BamHI from the batch and append an NdeI site to the sequence.
     reduced_batch = NdeI + EcoRI + BsmBI
     target += NdeI.site
     ana.change(sequence=target)
     ana.change(rb=reduced_batch)
     self.assertEqual(len(ana.full()), 3)
     self.assertEqual(ana.full()[NdeI], [85])

     # An unsupported option name must be rejected.
     with self.assertRaises(AttributeError):
         ana.change(**{'NameWidth': 3, 'KonsoleWidth': 40})  # Console
Ejemplo n.º 9
0
def remove_restriction_sites(dna_sequence, codon_use_table, restrict_sites):
    """Identify and remove sequences recognized by a set of restriction
    enzymes.

    Every cut position reported by the analysis is mapped to a codon index,
    and that codon is replaced with the result of `mutate_codon`.

    Args:
        dna_sequence (Bio.Seq.Seq): A read-only representation of
            the DNA sequence.
        codon_use_table (dict{str, list[list, list]}): A dictionary with
            each amino acid three-letter code as keys, and a list of two
            lists as values. The first list is the synonymous codons that
            encode the amino acid, the second is the frequency with which
            each synonymous codon is used.
        restrict_sites (Bio.Restriction.RestrictionBatch): RestrictionBatch
            instance configured with the input restriction enzymes.

    Returns:
        Bio.Seq.Seq: A read-only representation of the new DNA sequence.
    """

    logger.info("Removing restriction sites")

    # check each unwanted restriction site
    analysis = Analysis(restrictionbatch=restrict_sites, sequence=dna_sequence)
    result = analysis.full()

    mutable_seq = dna_sequence.tomutable()
    for enz, cuts in result.items():
        for cut in cuts:
            # Log the position currently being handled rather than the
            # enzyme's complete list of cut positions.
            logger.info(
                "Restriction enzyme ({}) cut site detected at position {}.".
                format(str(enz), cut))
            # map sequence position to codon position
            # subtract 1 from `cut` to convert from sequence to string indexing
            codon_pos, offset = divmod((cut - 1) - (enz.size // 2), 3)

            # ensure the whole codon we mutate is being recognized by the restriction enzyme
            if offset:
                codon_pos += 1
            codon_idx = slice(codon_pos * 3, (codon_pos + 1) * 3)

            new_codon = mutate_codon(mutable_seq[codon_idx], codon_use_table)
            mutable_seq[codon_idx] = new_codon

    return mutable_seq.toseq()
Ejemplo n.º 10
0
 def test_analysis_restrictions(self):
     """Check the result filters of Analysis on two small sequences."""
     circular_seq = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
                        IUPACAmbiguousDNA())
     batch = RestrictionBatch([EcoRI, KpnI, EcoRV])
     ana = Analysis(batch, circular_seq, linear=False)

     # Expected dictionaries shared by several assertions below.
     ecori_hit = {EcoRI: [33]}
     non_cutters = {KpnI: [], EcoRV: []}
     everything = {KpnI: [], EcoRV: [], EcoRI: [33]}

     # Blunt cutters only.
     self.assertEqual(ana.blunt(), {EcoRV: []})
     self.assertEqual(ana.full(), everything)
     # Enzymes that have a site.
     self.assertEqual(ana.with_sites(), ecori_hit)
     # Enzymes that have no site.
     self.assertEqual(ana.without_site(), non_cutters)
     self.assertEqual(ana.with_site_size([32]), {})
     # Enzymes leaving 5' overhangs.
     self.assertEqual(ana.overhang5(), ecori_hit)
     # Enzymes leaving 3' overhangs.
     self.assertEqual(ana.overhang3(), {KpnI: []})
     # Enzymes producing defined ends.
     self.assertEqual(ana.defined(), everything)
     # Enzymes which cut N times.
     self.assertEqual(ana.with_N_sites(2), {})

     # Enzymes cutting only between two positions; both bounds must be
     # numbers.
     with self.assertRaises(TypeError):
         ana.only_between('t', 20)
     with self.assertRaises(TypeError):
         ana.only_between(1, 't')
     self.assertEqual(ana.only_between(1, 20), {})
     self.assertEqual(ana.only_between(20, 34), ecori_hit)
     # Start and end given in swapped order.
     self.assertEqual(ana.only_between(34, 20), ecori_hit)
     self.assertEqual(ana.only_outside(20, 34), {})

     with self.assertWarns(BiopythonWarning):
         ana.with_name(['fake'])
     self.assertEqual(ana.with_name([EcoRI]), ecori_hit)

     # Boundary normalisation: in order, reversed, negative start,
     # negative end.
     self.assertEqual(ana._boundaries(1, 20)[:2], (1, 20))
     self.assertEqual(ana._boundaries(20, 1)[:2], (1, 20))
     self.assertEqual(ana._boundaries(-1, 20)[:2], (20, 33))
     self.assertEqual(ana._boundaries(1, -1)[:2], (1, 33))

     # A sequence with one EcoRI site inside and one outside the window.
     two_site_seq = Seq('GAATTCAAAAAAGAATTC', IUPACAmbiguousDNA())
     ecori_batch = RestrictionBatch([EcoRI])
     ana = Analysis(ecori_batch, two_site_seq)
     both_sites = {EcoRI: [2, 14]}
     # Cut at least inside.
     self.assertEqual(ana.between(1, 7), both_sites)
     # Cut at least inside and report only the inside site.
     self.assertEqual(ana.show_only_between(1, 7), {EcoRI: [2]})
     # Cut at least outside.
     self.assertEqual(ana.outside(1, 7), both_sites)
     # Do not cut within.
     self.assertEqual(ana.do_not_cut(7, 12), both_sites)
Ejemplo n.º 11
0
 def test_change(self):
     """Check that change() replaces the analysed sequence and batch."""
     # Two BamHI sites and one EcoRV site separated by filler DNA.
     dna = Seq('CCAGTCTATAATTCG' + BamHI.site + 'GCGGCATCATACTCGA' +
               BamHI.site + 'ATATCGCGTGATGATA' + EcoRV.site +
               'CGTAGTAATTACGCATG')
     full_batch = NdeI + EcoRI + BamHI + BsmBI
     restriction_analysis = Analysis(full_batch, dna)
     bamhi_cuts = restriction_analysis.full()[BamHI]
     self.assertEqual(bamhi_cuts, [17, 39])

     # New batch without BamHI, new sequence ending in an NdeI site.
     batch_without_bamhi = NdeI + EcoRI + BsmBI
     dna += NdeI.site
     restriction_analysis.change(sequence=dna)
     restriction_analysis.change(rb=batch_without_bamhi)
     enzyme_count = len(restriction_analysis.full())
     self.assertEqual(enzyme_count, 3)
     ndei_cuts = restriction_analysis.full()[NdeI]
     self.assertEqual(ndei_cuts, [85])

     # The test expects AttributeError for this pair of option names.
     bad_options = {'NameWidth': 3, 'KonsoleWidth': 40}  # Console
     with self.assertRaises(AttributeError):
         restriction_analysis.change(**bad_options)
Ejemplo n.º 12
0
 def test_analysis_restrictions(self):
     """Run the Analysis filter methods against known expected results."""
     seq_a = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
                 IUPACAmbiguousDNA())
     enzymes_a = RestrictionBatch([EcoRI, KpnI, EcoRV])
     # The first sequence is analysed as non-linear.
     ana = Analysis(enzymes_a, seq_a, linear=False)

     # Only enzymes which cut blunt:
     blunt_result = ana.blunt()
     self.assertEqual(blunt_result, {EcoRV: []})
     full_result = ana.full()
     self.assertEqual(full_result, {KpnI: [], EcoRV: [], EcoRI: [33]})
     # Only enzymes which have a site:
     self.assertEqual(ana.with_sites(), {EcoRI: [33]})
     # Only enzymes which have no site:
     self.assertEqual(ana.without_site(), {KpnI: [], EcoRV: []})
     self.assertEqual(ana.with_site_size([32]), {})
     # Only enzymes which produce 5' overhangs:
     self.assertEqual(ana.overhang5(), {EcoRI: [33]})
     # Only enzymes which produce 3' overhangs:
     self.assertEqual(ana.overhang3(), {KpnI: []})
     # Only enzymes which produce defined ends:
     self.assertEqual(ana.defined(), {KpnI: [], EcoRV: [], EcoRI: [33]})
     # Only enzymes which cut N times:
     self.assertEqual(ana.with_N_sites(2), {})

     # Enzymes which cut between position x and y; a non-numeric bound
     # on either side is a TypeError.
     with self.assertRaises(TypeError):
         ana.only_between('t', 20)
     with self.assertRaises(TypeError):
         ana.only_between(1, 't')
     self.assertEqual(ana.only_between(1, 20), {})
     self.assertEqual(ana.only_between(20, 34), {EcoRI: [33]})
     # Mixed start/end order:
     self.assertEqual(ana.only_between(34, 20), {EcoRI: [33]})
     self.assertEqual(ana.only_outside(20, 34), {})
     with self.assertWarns(BiopythonWarning):
         ana.with_name(['fake'])
     self.assertEqual(ana.with_name([EcoRI]), {EcoRI: [33]})

     # (start, end) arguments -> expected first two items of _boundaries:
     # plain, reversed order, negative start, negative end.
     boundary_cases = (
         ((1, 20), (1, 20)),
         ((20, 1), (1, 20)),
         ((-1, 20), (20, 33)),
         ((1, -1), (1, 33)),
     )
     for (start, end), expected in boundary_cases:
         self.assertEqual(ana._boundaries(start, end)[:2], expected)

     # Sites in- and outside of boundaries
     seq_b = Seq('GAATTCAAAAAAGAATTC', IUPACAmbiguousDNA())
     enzymes_b = RestrictionBatch([EcoRI])
     ana = Analysis(enzymes_b, seq_b)
     # Cut at least inside
     self.assertEqual(ana.between(1, 7), {EcoRI: [2, 14]})
     # Cut at least inside and report only inside site
     self.assertEqual(ana.show_only_between(1, 7), {EcoRI: [2]})
     # Cut at least outside
     self.assertEqual(ana.outside(1, 7), {EcoRI: [2, 14]})
     # Don't cut within
     self.assertEqual(ana.do_not_cut(7, 12), {EcoRI: [2, 14]})
Ejemplo n.º 13
0
def findRestrictionSites(sequence, restr_batch):
    """Return the full restriction analysis of a DNA string.

    `sequence` is wrapped in a Seq with the IUPACAmbiguousDNA alphabet,
    `restr_batch` is turned into a RestrictionBatch, and the result of
    Analysis.full() for the two is returned.
    """
    dna = Seq(sequence, IUPACAmbiguousDNA())
    return Analysis(RestrictionBatch(restr_batch), dna).full()
Ejemplo n.º 14
0
def get_restriction_table(seq, enzyme, circular=False):
    """
    Get the restriction table for a single genomic sequence.

    Parameters
    ----------
    seq : Seq object
        A biopython Seq object representing a chromosomes or contig.
    enzyme : int, str or list of str
        The name of the restriction enzyme used, or a list of restriction
        enzyme names. Can also be an integer, to digest by fixed chunk size.
        A chunk size smaller than DEFAULT_MIN_CHUNK_SIZE is raised to that
        value.
    circular : bool
        Whether the genome is circular.

    Returns
    -------
    numpy.array:
        List of restriction fragment boundary positions for the input sequence.

    Raises
    ------
    ValueError
        If `enzyme` is neither a valid enzyme name (or list of names) nor
        convertible to an integer chunk size.

    >>> from Bio.Seq import Seq
    >>> get_restriction_table(Seq("AAGATCGATCGG"),"DpnII")
    array([ 0,  2,  6, 12])
    >>> get_restriction_table(Seq("AA"),["DpnII", "HinfI"])
    array([0, 2])
    >>> get_restriction_table(Seq("AA"),"aeiou1")
    Traceback (most recent call last):
        ...
    ValueError: aeiou1 is not a valid restriction enzyme.
    >>> get_restriction_table("AA","DpnII")
    Traceback (most recent call last):
        ...
    TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead

    """
    chrom_len = len(seq)
    wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme)
    # Restriction batch containing the restriction enzyme
    try:
        enz = [enzyme] if isinstance(enzyme, str) else enzyme
        cutter = RestrictionBatch(enz)
    except (TypeError, ValueError):
        # Not a usable enzyme specification: fall back to treating the
        # argument as a fixed chunk size.
        try:
            cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)
        except (TypeError, ValueError):
            # int() raises TypeError (not ValueError) for lists and None;
            # report both cases uniformly as an invalid enzyme.
            raise ValueError(wrong_enzyme)

    if isinstance(cutter, int):
        # Fixed-size digestion: one boundary every `cutter` bases.
        sites = list(range(0, chrom_len, cutter))
        # Close the last fragment at the sequence end. The emptiness check
        # avoids an IndexError on a zero-length sequence.
        if not sites or sites[-1] < chrom_len:
            sites.append(chrom_len)
    else:
        # Find sites of all restriction enzymes given
        ana = Analysis(cutter, seq, linear=not circular)
        # Gets all sites into a single flat list with 0-based index,
        # sorted by position
        sites = sorted(
            site - 1 for cuts in ana.full().values() for site in cuts
        )
        # Add start and end of seq as outer boundaries
        sites.insert(0, 0)
        sites.append(chrom_len)

    return np.array(sites)
Ejemplo n.º 15
0
 def test_analysis_restrictions(self):
     """Check several Analysis result filters on one short sequence."""
     dna = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
               IUPACAmbiguousDNA())
     batch = RestrictionBatch([EcoRI, KpnI, EcoRV])
     ana = Analysis(batch, dna, linear=False)

     # Expected result for every filter that keeps only the EcoRI site.
     ecori_hit = {EcoRI: [33]}

     # output only the result for enzymes which cut blunt
     self.assertEqual(ana.blunt(), {EcoRV: []})
     self.assertEqual(ana.full(), {KpnI: [], EcoRV: [], EcoRI: [33]})
     # output only the result for enzymes which have a site
     self.assertEqual(ana.with_sites(), ecori_hit)
     # output only the enzymes which have no site
     self.assertEqual(ana.without_site(), {KpnI: [], EcoRV: []})
     self.assertEqual(ana.with_site_size([32]), {})
     # the enzymes which cut between position 1 and 20
     self.assertEqual(ana.only_between(1, 20), {})
     # same filter for positions 20 to 34
     self.assertEqual(ana.only_between(20, 34), ecori_hit)
     # mix start end order
     self.assertEqual(ana.only_between(34, 20), ecori_hit)
     self.assertEqual(ana.only_outside(20, 34), {})
     with self.assertWarns(BiopythonWarning):
         ana.with_name(['fake'])
     self.assertEqual(ana.with_name([EcoRI]), ecori_hit)
     self.assertEqual(ana._boundaries(1, 20)[:2], (1, 20))
     # reverse order
     self.assertEqual(ana._boundaries(20, 1)[:2], (1, 20))
     # fix negative start
     self.assertEqual(ana._boundaries(-1, 20)[:2], (20, 33))
Ejemplo n.º 16
0
 def run_p(self):
     """Run the restriction analysis and display sites and hit counts.

     Requires that a fasta file has been read (`fastaRead`) and that at
     least one enzyme has been chosen (`userChoices`); otherwise a hint
     is shown in the results pane and nothing else happens.

     On success the formatted analysis output is written to
     `restrictResults`, and one "count : enzyme" line per chosen enzyme
     is written to `numeralResults`. The counts are derived by parsing
     the formatted output, not from the analysis dictionary.

     If the analysis itself fails, the error is logged and the method
     returns early (leaving the detect button disabled).
     """
     global userChoices, enzymes, fastaRead
     sh.log("\nstart run_p")
     sh.click()
     self.restrictResults.clear()
     self.numeralResults.clear()
     if not fastaRead:
         self.restrictResults.setPlainText(
             "You must select a fasta file first")
         return
     if len(userChoices) <= 0:
         self.restrictResults.setPlainText(
             "You must select R.Enzymes first")
         return
     self.detectPushButton.setEnabled(False)  # can't run twice
     try:
         linear = self.linearCheckBox.isChecked()
         analysis = Analysis(userChoices, self.sequence, linear=linear)
     except Exception as err:
         # Without an analysis object nothing below can work, so stop
         # here instead of failing later on an unbound name.
         sh.log("analysis failed " + repr(err))
         return
     # print each enzyme with a list of it's matching sites
     cutSites = str(
         analysis.format_output(
             dct=None,
             title='',
             s1='\n  Enzymes which do not cut the sequence\n'))
     self.restrictResults.setPlainText(cutSites)
     # ------------------------------- FIND PALINDROME HIT COUNTS -----------------------------------------------
     endMarker = "END"
     palin = None
     try:
         # NOTE(review): this grows the global `enzymes` list by one
         # marker per run and the list is not used again in this method
         # -- kept as-is, confirm whether other code relies on it.
         enzymes.append(endMarker)
         # Extract enzymes and the index of their cutSites from cutSites.
         # Only the part before the non-cutter heading is parsed; when
         # the heading is absent the whole text is used (find() returns
         # -1, which as a slice bound would drop the last character).
         headingAt = cutSites.find("Enzymes")
         cutterText = cutSites if headingAt < 0 else cutSites[:headingAt]
         palin = cutterText.replace('.', "").replace(':', "").split()
         palin.append(endMarker)
         sh.log("palin: " + str(palin))
     except Exception as err:
         sh.log("palin NG " + repr(err))
     if palin is not None:
         try:
             # Calculate and display the number of matching sites for each enzyme
             # enzPosn initally has a list of lists.  Each sublist has the enzyme name
             #   and the index of the enzyme in palin
             # enzPosn sublist later has the enzyme name and the number of matches.
             enzPosn = []
             enzNone = []
             sh.log("len palin " + str(len(palin)))
             sh.log("user choices " + str(userChoices))
             # Work on a copy so the end marker is not appended to the
             # global selection itself.
             allChoices = list(userChoices)
             allChoices.append(endMarker)  # matches last name in palin
             sh.log("allChoices " + str(allChoices))
             for enz in allChoices:
                 if enz in palin:
                     enzPosn.append([enz, palin.index(enz)])
                 else:
                     sh.log(enz + " not in palin")
                     enzNone.append(enz)
             sh.log("enzPosn = " + str(enzPosn))
             enzPosn.sort(key=lambda x: x[1])  # sort on index of name in palin
             # Replace each index with the number of tokens between this
             # name and the next one, i.e. the length of the palin entry.
             # The successor's index is still unmodified when it is read.
             for entry, successor in zip(enzPosn, enzPosn[1:]):
                 entry[1] = successor[1] - entry[1] - 1
             del enzPosn[-1]  # delete endMarker
             for enz in enzNone:
                 enzPosn.append([enz, 0])  # add in enzymes not found; length = 0
             enzPosn.sort(key=lambda x: x[0])  # sort on name
             sh.log("enzPosn = " + str(enzPosn))
             # show the number of matches for each enzyme
             for name, count in enzPosn:
                 matchStr = "{0:7,d} : {1:s}\n\n".format(count, name)
                 self.numeralResults.insertPlainText(matchStr)
         except Exception as err:
             sh.log('I cannot do that. ' + repr(err))
     self.detectPushButton.setEnabled(False)
     self.nPosPushButton.setEnabled(True)