Ejemplo n.º 1
0
def digest_genome(genome_fp, restriction_enzyme, output_dir, linear=False):
    """Digest a genome in silico and append its fragments to a BED file.

    The output file is created in ``output_dir`` and named
    ``<genome file name without its last extension>.<enzyme>.fragments.bed``.
    If that file already exists the user is asked interactively whether it
    may be replaced; any answer other than ``y`` aborts without digesting.

    Args:
        genome_fp: Path to the genome file. Files ending in ``.fa``,
            ``.fas``, ``.fasta`` or ``.fna`` (case-insensitive) are parsed
            as FASTA, everything else as GenBank.
        restriction_enzyme: Enzyme name understood by ``RestrictionBatch``.
        output_dir: Directory that receives the BED file.
        linear: Forwarded to ``RestrictionBatch.search`` for every record.

    Returns:
        None. Rows of ``chr, start, end, frag_id`` are appended to the BED
        file (tab separated, no header), one batch per sequence record.
    """
    genome_name = os.path.basename(genome_fp)
    # Strip only the last extension; a name without a dot is used as-is.
    stem = genome_name.rsplit('.', 1)[0]
    base_fp = os.path.join(
        output_dir, '{}.{}.fragments.bed'.format(stem, restriction_enzyme))
    if os.path.isfile(base_fp):
        overwrite = input(
            'WARNING: Overwriting existing fragment BED {}. Continue? [y/N]'.
            format(base_fp))
        if overwrite.lower() != 'y':
            print("Did not overwrite existing fragment BED.")
            return
        os.remove(base_fp)
    print("Digesting")
    # Decide the format from the file extension rather than from a substring
    # of the whole path (any path containing "fa" used to be read as FASTA).
    extension = os.path.splitext(genome_name)[1].lower()
    if extension in ('.fa', '.fas', '.fasta', '.fna'):
        seq_format = 'fasta'
    else:
        seq_format = 'genbank'
    # The batch does not depend on the record, so build it once.
    batch = RestrictionBatch([restriction_enzyme])
    # Plain "r": the "U" flag is rejected by Python 3.11+. The context
    # manager closes the handle once every record has been consumed.
    with open(genome_fp, "r") as handle:
        for chromosome in SeqIO.parse(handle, format=seq_format):
            chrom_len = len(chromosome.seq)
            print('{}\t{}'.format(chromosome.id, chrom_len))
            # Digest the sequence data and return the cut points
            hits = batch.search(chromosome.seq, linear=linear)
            for cutpoints in hits.values():
                if len(cutpoints) == 0:
                    print('No restriction sites found for {}'.format(
                        chromosome.id))
                    continue
                df = pd.DataFrame(cutpoints, columns=['cutpoint'])
                df['end'] = df.cutpoint - 1
                # Each fragment starts where the previous one ended.
                df['start'] = df.end - (df.cutpoint.diff())
                df.loc[0, 'start'] = 0
                df['start'] = df['start'].astype('Int64')
                if len(df) > 1:
                    # Trailing fragment: last cut up to the sequence end.
                    last_fragment = pd.DataFrame({
                        'start': [df.loc[len(df) - 1, 'end']],
                        'end': [chrom_len],
                        'cutpoint': [-1]
                    })
                    # DataFrame.append was removed in pandas 2.0.
                    df = pd.concat([df, last_fragment], ignore_index=True)
                else:
                    # NOTE(review): a single cut point yields one fragment
                    # spanning the whole record - confirm this is intended
                    # for linear sequences.
                    df.loc[len(df) - 1, 'end'] = chrom_len
                df['frag_id'] = df.index
                # Drop a trailing ".version" suffix and make sure the name
                # carries the "chr" prefix. rsplit tolerates ids that
                # contain more than one dot.
                accession = chromosome.id
                if "." in accession:
                    accession = accession.rsplit(".", 1)[0]
                if not accession.startswith("chr"):
                    accession = "chr" + accession
                df['chr'] = accession
                df[['chr', 'start', 'end', 'frag_id']].to_csv(base_fp,
                                                              index=False,
                                                              sep='\t',
                                                              mode='a',
                                                              header=None)
Ejemplo n.º 2
0
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    """List enzymes that cut the MCS sequence but not the GOI sequence.

    Each sequence is taken from the string argument when it is non-empty,
    otherwise it is loaded with ``read_seq`` from the matching file argument.

    Returns:
        A list of ``(enzyme name, cut position in the MCS, end description,
        space-joined supplier codes)`` tuples sorted by cut position. The
        end description is ``"blunt"`` for blunt cutters and the enzyme's
        ``elucidate()`` string otherwise.

    Raises:
        Exception: If either sequence ends up empty.
    """
    batch = RestrictionBatch(suppliers=list('CBEIKJMONQSRVYX'))

    if goi:
        goi = Seq(goi, IUPACUnambiguousDNA())
    else:
        goi = read_seq(goiFile)
    if not goi:
        raise Exception('Please provide a GOI sequence!')

    if mcs:
        mcs = Seq(mcs, IUPACUnambiguousDNA())
    else:
        mcs = read_seq(mcsFile)
    if not mcs:
        raise Exception('Please provide a MCS sequence!')

    mcs_hits = batch.search(mcs)
    goi_hits = batch.search(goi)

    # Enzymes with at least one site in the MCS and none in the GOI.
    mcs_cutters = {enz for enz in mcs_hits.keys() if mcs_hits[enz]}
    goi_cutters = {enz for enz in goi_hits.keys() if goi_hits[enz]}
    usable = mcs_cutters - goi_cutters

    rows = []
    for enz in usable:
        ends = "blunt" if enz.is_blunt() else enz.elucidate()
        suppliers = ' '.join(enz.suppl)
        rows.extend((str(enz), site, ends, suppliers)
                    for site in mcs_hits[enz])

    rows.sort(key=lambda row: row[1])
    return rows
Ejemplo n.º 3
0
def has_restriction_site(seq):
    """Return True when any enzyme in ``restriction_sites`` has a hit in ``seq``.

    ``restriction_sites`` is looked up in the enclosing module scope and is
    passed unchanged to ``RestrictionBatch``.
    """
    from Bio.Restriction import RestrictionBatch
    from Bio.Seq import Seq

    cut_map = RestrictionBatch(restriction_sites).search(Seq(seq))

    # A non-empty position list for any enzyme means a site is present.
    for positions in cut_map.values():
        if positions:
            return True
    return False
Ejemplo n.º 4
0
    def test_batch_analysis(self):
        """Search one sequence with a two-enzyme batch and check the hits."""
        spacer = "AAAA"
        template = Seq(spacer + EcoRV.site + spacer + EcoRI.site + spacer)
        cut_map = RestrictionBatch([EcoRV, EcoRI]).search(template)

        # Expected cut position reported for each enzyme.
        for enzyme, expected in ((EcoRV, [8]), (EcoRI, [16])):
            self.assertEqual(cut_map[enzyme], expected)
Ejemplo n.º 5
0
    def test_batch_analysis(self):
        """Search one sequence with a two-enzyme batch and check the hits."""
        spacer = "AAAA"
        template = Seq(spacer + EcoRV.site + spacer + EcoRI.site + spacer,
                       IUPACAmbiguousDNA())
        cut_map = RestrictionBatch([EcoRV, EcoRI]).search(template)

        # Expected cut position reported for each enzyme.
        for enzyme, expected in ((EcoRV, [8]), (EcoRI, [16])):
            self.assertEqual(cut_map[enzyme], expected)
Ejemplo n.º 6
0
def apply_restricts(dna, restricts, circular=False):
    '''Cleave ``dna`` with each entry of ``restricts`` in turn.

    Every entry is converted to its string name and resolved to an enzyme
    through a fresh ``RestrictionBatch``. The current list of fragments is
    then handed to ``_apply_restrict_to_dnas`` together with ``circular``,
    and its result becomes the input for the next enzyme.

    Returns the fragment list produced after the last enzyme (``[dna]``
    when ``restricts`` is empty).
    '''
    fragments = [dna]

    for restrict in restricts:
        enzyme_name = str(restrict)
        # get(..., add=True) registers the name in the empty batch and
        # returns the resolved enzyme in one step.
        enzyme = RestrictionBatch().get(enzyme_name, add=True)
        fragments = _apply_restrict_to_dnas(fragments, enzyme, circular)

    return fragments
Ejemplo n.º 7
0
 def OnDistribution(self, event):
     """Plot a histogram of fragment lengths for the selected enzyme.

     Reads the FASTA path from ``self.genome_name`` and the enzyme name from
     ``self.site_enzyme``, concatenates every record of the file into one
     sequence, digests it with ``enzyme.catalyse`` and shows a 1000-bin
     histogram of the fragment lengths. Nothing is plotted when the path
     is empty.

     ``event`` is not used by the body.
     """
     fasta_file = self.genome_name.GetValue()
     enzyme_name = self.site_enzyme.GetStringSelection()
     enzyme = RestrictionBatch([enzyme_name]).get(enzyme_name)
     genome = ""
     if fasta_file and enzyme:
         # Plain "r": the "U" flag raises ValueError on Python 3.11+.
         with open(fasta_file, "r") as handle:
             for record in SeqIO.parse(handle, "fasta"):
                 genome += record.seq

         plt.hist([len(i) for i in enzyme.catalyse(genome)], alpha=.3, bins=1000)
         plt.show()
Ejemplo n.º 8
0
 def OnDistribution(self, event):
     """Plot a histogram of fragment lengths for the selected enzyme.

     Reads the FASTA path from ``self.genome_name`` and the enzyme name from
     ``self.site_enzyme``, concatenates every record of the file into one
     sequence, digests it with ``enzyme.catalyse`` and shows a 1000-bin
     histogram of the fragment lengths. Nothing is plotted when the path
     is empty.

     ``event`` is not used by the body.
     """
     fasta_file = self.genome_name.GetValue()
     enzyme_name = self.site_enzyme.GetStringSelection()
     enzyme = RestrictionBatch([enzyme_name]).get(enzyme_name)
     genome = ""
     if fasta_file and enzyme:
         # Plain "r": the "U" flag raises ValueError on Python 3.11+.
         with open(fasta_file, "r") as handle:
             for record in SeqIO.parse(handle, "fasta"):
                 genome += record.seq

         plt.hist([len(i) for i in enzyme.catalyse(genome)], alpha=.3, bins=1000)
         plt.show()
Ejemplo n.º 9
0
 def test_analysis_restrictions(self):
     """Check the filtering helpers of ``Analysis`` on two small sequences."""
     wrapped_seq = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
                       IUPACAmbiguousDNA())
     batch = RestrictionBatch([EcoRI, KpnI, EcoRV])
     analysis = Analysis(batch, wrapped_seq, linear=False)

     # Blunt cutters only.
     self.assertEqual(analysis.blunt(), {EcoRV: []})
     # Every enzyme of the batch.
     self.assertEqual(analysis.full(),
                      {KpnI: [], EcoRV: [], EcoRI: [33]})
     # Enzymes that have a site.
     self.assertEqual(analysis.with_sites(), {EcoRI: [33]})
     # Enzymes that have no site.
     self.assertEqual(analysis.without_site(), {KpnI: [], EcoRV: []})
     self.assertEqual(analysis.with_site_size([32]), {})
     # Enzymes producing 5' overhangs.
     self.assertEqual(analysis.overhang5(), {EcoRI: [33]})
     # Enzymes producing 3' overhangs.
     self.assertEqual(analysis.overhang3(), {KpnI: []})
     # Enzymes producing defined ends.
     self.assertEqual(analysis.defined(),
                      {KpnI: [], EcoRV: [], EcoRI: [33]})
     # Enzymes which cut N times.
     self.assertEqual(analysis.with_N_sites(2), {})

     # Non-numeric boundaries are rejected.
     for bad_bounds in (('t', 20), (1, 't')):
         with self.assertRaises(TypeError):
             analysis.only_between(*bad_bounds)
     # Enzymes cutting between two positions.
     self.assertEqual(analysis.only_between(1, 20), {})
     self.assertEqual(analysis.only_between(20, 34), {EcoRI: [33]})
     # Start and end given in swapped order.
     self.assertEqual(analysis.only_between(34, 20), {EcoRI: [33]})
     self.assertEqual(analysis.only_outside(20, 34), {})

     with self.assertWarns(BiopythonWarning):
         analysis.with_name(['fake'])
     self.assertEqual(analysis.with_name([EcoRI]), {EcoRI: [33]})

     # _boundaries: as given, reversed, negative start, negative end.
     boundary_cases = (
         ((1, 20), (1, 20)),
         ((20, 1), (1, 20)),
         ((-1, 20), (20, 33)),
         ((1, -1), (1, 33)),
     )
     for bounds, expected in boundary_cases:
         self.assertEqual((analysis._boundaries(*bounds)[:2]), expected)

     # Sites inside and outside of the given boundaries.
     two_site_seq = Seq('GAATTCAAAAAAGAATTC', IUPACAmbiguousDNA())
     batch = RestrictionBatch([EcoRI])
     analysis = Analysis(batch, two_site_seq)
     # Cut at least inside.
     self.assertEqual(analysis.between(1, 7), {EcoRI: [2, 14]})
     # Cut at least inside and report only the inside site.
     self.assertEqual(analysis.show_only_between(1, 7), {EcoRI: [2]})
     # Cut at least outside.
     self.assertEqual(analysis.outside(1, 7), {EcoRI: [2, 14]})
     # Do not cut within.
     self.assertEqual(analysis.do_not_cut(7, 12), {EcoRI: [2, 14]})
Ejemplo n.º 10
0
    def re_sites(self, sequence):
        """Return ``(position, enzyme)`` pairs for ``sequence``, sorted by position.

        The sequence is searched with a batch built from ``self.enzyme_set``.
        Every reported cut is shifted by ``enzyme.fst3 - 1`` before being
        stored. When several enzymes map to the same shifted position, the
        one visited last is kept.
        """
        target = Seq(sequence, IUPACAmbiguousDNA)
        batch = RestrictionBatch(self.enzyme_set)

        located = {
            cut + enzyme.fst3 - 1: enzyme
            for enzyme, cutsites in batch.search(target).items()
            for cut in cutsites
        }
        return sorted(located.items())
Ejemplo n.º 11
0
    def test_creating_batch(self):
        """Build a restriction batch, then grow, query and shrink it."""
        enzymes = RestrictionBatch([EcoRI])
        enzymes.add(KpnI)
        enzymes += EcoRV
        self.assertEqual(len(enzymes), 3)

        # Membership checked with enzyme objects...
        for member in (EcoRV, EcoRI, KpnI):
            self.assertIn(member, enzymes)
        self.assertNotIn(SmaI, enzymes)
        # ...and with enzyme names.
        self.assertIn('EcoRV', enzymes)
        self.assertNotIn('SmaI', enzymes)

        enzymes.get(EcoRV)
        with self.assertRaises(ValueError):
            enzymes.get(SmaI)

        enzymes.remove(EcoRV)
        self.assertEqual(len(enzymes), 2)

        for absent in (EcoRV, 'EcoRV'):
            self.assertNotIn(absent, enzymes)
Ejemplo n.º 12
0
 def test_analysis_restrictions(self):
     """Check the filtering helpers of ``Analysis`` (built with linear=False)."""
     wrapped_seq = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA',
                       IUPACAmbiguousDNA())
     batch = RestrictionBatch([EcoRI, KpnI, EcoRV])
     analysis = Analysis(batch, wrapped_seq, linear=False)

     # Blunt cutters only.
     self.assertEqual(analysis.blunt(), {EcoRV: []})
     self.assertEqual(analysis.full(),
                      {KpnI: [], EcoRV: [], EcoRI: [33]})
     # Enzymes that have a site.
     self.assertEqual(analysis.with_sites(), {EcoRI: [33]})
     # Enzymes that have no site.
     self.assertEqual(analysis.without_site(), {KpnI: [], EcoRV: []})
     self.assertEqual(analysis.with_site_size([32]), {})

     # Enzymes cutting between two positions.
     self.assertEqual(analysis.only_between(1, 20), {})
     self.assertEqual(analysis.only_between(20, 34), {EcoRI: [33]})
     # Start and end given in swapped order.
     self.assertEqual(analysis.only_between(34, 20), {EcoRI: [33]})
     self.assertEqual(analysis.only_outside(20, 34), {})

     with self.assertWarns(BiopythonWarning):
         analysis.with_name(['fake'])
     self.assertEqual(analysis.with_name([EcoRI]), {EcoRI: [33]})

     # _boundaries: as given, reversed order, negative start.
     boundary_cases = (
         ((1, 20), (1, 20)),
         ((20, 1), (1, 20)),
         ((-1, 20), (20, 33)),
     )
     for bounds, expected in boundary_cases:
         self.assertEqual((analysis._boundaries(*bounds)[:2]), expected)
Ejemplo n.º 13
0
 def __init__(be, bp_enzyme, price=None, units=None):
     """Initialise the instance (named ``be`` here) from ``bp_enzyme``.

     ``price`` and ``units`` are forwarded to ``enzyme.__init__``. The
     recognition site, its length, the enzyme's name and two cut offsets
     taken from ``bp_enzyme.charac`` are copied onto the instance; the second
     offset is stored with the site length added.
     """
     enzyme.__init__(be, price, units)

     be.restriction_site = bp_enzyme.site
     be.site_len = len(be.restriction_site)

     # as_string() returns the names of the batch; it holds one enzyme here.
     names = RestrictionBatch([bp_enzyme]).as_string()
     be.name = names[0]

     be.cut0 = bp_enzyme.charac[0]
     be.cut1 = be.site_len + bp_enzyme.charac[1]

     be.bp = True
     be.bp_enzyme = bp_enzyme
Ejemplo n.º 14
0
def calc_digest_products(seq, enzymes, *, is_circular):
    """Return the fragments produced by digesting ``seq`` with ``enzymes``.

    Args:
        seq: Template sequence; it is sliced directly, so the fragments have
            the same type as ``seq``.
        enzymes: Enzyme names. A trailing ``-HF`` or ``-HFv2`` suffix is
            removed before the name is looked up.
        is_circular: Whether the template is circular. Circular templates
            are searched across the origin and yield one wrap-around
            fragment instead of two end fragments.

    Returns:
        The fragments in template order; for circular templates the
        wrap-around fragment comes last.

    Raises:
        UsageError: If ``enzymes`` is empty.
        ConfigError: If an enzyme name is unknown or no enzyme cuts.
    """
    from more_itertools import pairwise, flatten
    from Bio.Restriction import RestrictionBatch
    from Bio.Seq import Seq

    if not enzymes:
        raise UsageError("no enzymes specified", enzymes=enzymes)

    enzymes = [re.sub('-HF(v2)?$', '', x) for x in enzymes]

    try:
        batch = RestrictionBatch(enzymes)
    except ValueError:
        raise ConfigError(
            lambda e: f"unknown enzyme(s): {','.join(map(repr, e.enzymes))}",
            enzymes=enzymes,
        ) from None

    # Search with the template's real topology so that sites spanning the
    # origin of a circular template are found too.
    hits = batch.search(Seq(seq), linear=not is_circular)

    # A set collapses positions shared by several enzymes, which would
    # otherwise produce empty fragments. Positions become 0-based here.
    sites = {x - 1 for x in flatten(hits.values())}

    if not sites:
        raise ConfigError(
            lambda e:
            f"{','.join(map(repr, e.enzymes))} {plural(enzymes):/does/do} not cut template.",
            enzymes=enzymes,
            seq=seq,
        )

    if not is_circular:
        # A linear template also ends at both of its termini.
        sites |= {0, len(seq)}
    sites = sorted(sites)

    seqs = [seq[i:j] for i, j in pairwise(sites)]

    if is_circular:
        # Join the tail after the last cut with the head before the first.
        wrap_around = seq[sites[-1]:] + seq[:sites[0]]
        seqs.append(wrap_around)

    return seqs
Ejemplo n.º 15
0
    def fetch_restriction_sites(self, enzymes="Common"):
        """Find enzymes whose cut sites differ between reference and variant.

        ``enzymes`` selects the candidate set: ``"ALL"``, ``"Common"`` and
        ``"HF"`` pick the ``AllEnzymes``, ``CommOnly`` and ``high_fidelity``
        batches; any other value is treated as a comma-separated list of
        enzyme names. Ambiguous cutters are dropped from the candidates.

        Sets ``self.ref_sites`` and ``self.primary_variant_sites`` to the
        search results for ``self.ref_seq`` and ``self.primary_variant_seq``,
        and ``self.rflps`` to ``{enzyme: (ref cuts, variant cuts)}`` for
        enzymes with one to three reference cuts whose cuts differ between
        the two sequences.
        """
        if enzymes == "ALL":
            candidates = AllEnzymes
        elif enzymes == "Common":
            candidates = CommOnly
        elif enzymes == "HF":
            candidates = high_fidelity
        else:
            candidates = RestrictionBatch(enzymes.split(","))

        # Keep only enzymes that are not ambiguous cutters.
        unambiguous = RestrictionBatch(
            [enz for enz in candidates if enz.is_ambiguous() is False])

        # Copy the search results so each attribute owns its mapping.
        self.ref_sites = dict(unambiguous.search(self.ref_seq))
        self.primary_variant_sites = dict(
            unambiguous.search(self.primary_variant_seq))

        differing = {}
        for enz, ref_cuts in self.ref_sites.items():
            if not 0 < len(ref_cuts) <= 3:
                continue
            variant_cuts = self.primary_variant_sites[enz]
            if ref_cuts != variant_cuts:
                differing[enz] = (ref_cuts, variant_cuts)
        self.rflps = differing
Ejemplo n.º 16
0
def restriction_sites_present(spacer: str, rsb: RestrictionBatch) -> bool:
    """Determine whether any enzyme of a restriction batch has a site in a
    sequence\f

    Parameters
    ----------
    spacer : `str`
        Spacer sequence to examine for restriction sites.
    rsb : `RestrictionBatch`
        Enzymes to search for.

    Returns
    -------
    `bool`
        ``True`` if at least one enzyme of `rsb` reports a site in
        `spacer`, ``False`` otherwise.
    """

    # search() maps each enzyme to its list of hits; a non-empty list is
    # truthy, so any() stops at the first enzyme that has a site.
    return any(rsb.search(Seq(spacer)).values())
Ejemplo n.º 17
0
def get_restriction_site(enzyme):
    """Function to return a regex which corresponds to all possible restriction
    sites given a set of enzyme.

    Degenerate IUPAC nucleotide codes in a recognition site are translated
    to regex: ``N`` becomes ``.`` and every other ambiguity code becomes the
    character class of the bases it stands for (e.g. ``R`` -> ``[AG]``).

    Parameters:
    -----------
    enzyme : str
        String that contains the names of the enzyme separated by a comma.

    Returns:
    --------
    str :
        Regex that corresponds to all possible restriction sites given a set of
        enzyme.

    Examples:
    ---------
    >>> get_restriction_site('DpnII')
    'GATC'
    >>> get_restriction_site('DpnII,HinfI')
    'GA.TC|GATC'
    """
    # Regex equivalent of each IUPAC ambiguity code. Plain bases are not
    # listed and are copied through unchanged.
    iupac_to_regex = {
        "N": ".",
        "R": "[AG]",
        "Y": "[CT]",
        "M": "[AC]",
        "K": "[GT]",
        "S": "[CG]",
        "W": "[AT]",
        "B": "[CGT]",
        "D": "[AGT]",
        "H": "[ACT]",
        "V": "[ACG]",
    }

    # Split the str on the comma and look the enzymes up in Biopython.
    rb = RestrictionBatch(enzyme.split(","))

    # One pattern per distinct recognition site.
    restriction_patterns = {
        "".join(iupac_to_regex.get(base, base) for base in enz.site)
        for enz in rb
    }

    # Build the regex for all restriction sites, in a stable order.
    return "|".join(sorted(restriction_patterns))
Ejemplo n.º 18
0
from vcfkit.utils.primer3 import primer3
from subprocess import Popen, PIPE, check_output
from .reference import resolve_reference_genome
# Set numpy's print threshold to sys.maxsize so arrays are printed in full
# instead of being summarized.
np.set_printoptions(threshold=sys.maxsize)
from signal import signal, SIGPIPE, SIG_DFL
# Install the default handler (SIG_DFL) for SIGPIPE.
# NOTE(review): presumably so that piping output into a command that exits
# early (e.g. `head`) ends the process quietly - confirm.
signal(SIGPIPE, SIG_DFL)

from Bio.Seq import Seq
from Bio.Restriction import AllEnzymes, CommOnly, RestrictionBatch

# Module-wide flag for header output; starts out unset.
header_printed = False

# Enzyme batch selected by the "HF" option.
high_fidelity = RestrictionBatch([
    "AgeI", "ApoI",
    "BamHI", "BbsI", "BmtI", "BsaI", "BsiWI", "BsrGI", "BstEII", "BstZ17I",
    "DraIII",
    "EagI", "EcoRI", "EcoRV",
    "HindIII",
    "KpnI",
    "MfeI", "MluI",
    "NcoI", "NheI", "NotI", "NruI", "NsiI",
    "PstI", "PvuI", "PvuII",
    "SacI", "SalI", "SbfI", "ScaI", "SpeI", "SphI", "SspI", "StyI",
])

# Debug setting; None until something assigns it.
debug = None


class cvariant:
    """
        Mutable copy of a variant: every attribute of the wrapped object
        whose name does not start with an underscore is copied onto the
        new instance.
    """
    def __init__(self, variant):
        public_names = (name for name in dir(variant)
                        if not name.startswith("_"))
        for name in public_names:
            setattr(self, name, getattr(variant, name))

Ejemplo n.º 19
0
        self.layer = 'Restriction Enzymes'

    def to_dict(self):
        """Extend the parent's dict with this site's enzyme cut pattern and cut."""
        payload = super(Restriction_Site, self).to_dict()
        # elucidate() is the enzyme's textual description of its cut.
        payload['elucidate'] = self.enzyme.elucidate()
        payload['cut'] = self.cut
        return payload


# Candidate enzymes, listed alphabetically.
_MyEnzymes = [
    AatII, AflII, AgeI, ApaI, ApaLI, AscI, AseI,
    BamHI, BclI, BglII, BstBI,
    ClaI,
    DraI,
    EagI, EarI, EcoRI, EcoRV,
    FspI,
    HindIII, HpaI,
    KpnI,
    MscI,
    NarI, NcoI, NdeI, NheI, NotI, NruI,
    PacI, PmlI, PstI, PvuII,
    SacI, SacII, SalI, SmaI, SpeI, StuI,
    XbaI, XhoI, XmaI,
]
# Keep only the enzymes whose elucidate() string contains a '^' marker.
MyEnzymes = RestrictionBatch(
    [x for x in _MyEnzymes if '^' in x.elucidate()])


def find_restriction_sites(sequence, circular=True):
    input_seq = clean_sequence(sequence)
    if circular is True:
        input2 = Seq(input_seq + input_seq)
    else:
        input2 = Seq(input_seq)
    r = MyEnzymes.search(input2)
    cutter_list = []
    for enzyme in r:
        v = r[enzyme]
        for cut in v:
            cut_after = cut - 1
            if cut_after <= 0:
Ejemplo n.º 20
0
    def test_creating_batch(self):
        """Exercise construction, mutation, addition and suppliers of batches."""
        enzymes = RestrictionBatch()
        self.assertEqual(enzymes.suppl_codes()['N'], 'New England Biolabs')
        self.assertTrue(enzymes.is_restriction(EcoRI))

        enzymes = RestrictionBatch([EcoRI])
        enzymes.add(KpnI)
        enzymes += EcoRV
        self.assertEqual(len(enzymes), 3)
        self.assertEqual(enzymes.elements(), ['EcoRI', 'EcoRV', 'KpnI'])
        # The order of as_string() may differ between runs on Python 3, so
        # only membership is checked instead of the whole list.
        self.assertIn('EcoRI', enzymes.as_string())

        # Membership checked with enzyme objects...
        for member in (EcoRV, EcoRI, KpnI):
            self.assertIn(member, enzymes)
        self.assertNotIn(SmaI, enzymes)
        # ...and with enzyme names.
        self.assertIn('EcoRV', enzymes)
        self.assertNotIn('SmaI', enzymes)

        enzymes.get(EcoRV)
        with self.assertRaises(ValueError):
            enzymes.get(SmaI)
        # add=True inserts the missing enzyme instead of raising.
        enzymes.get(SmaI, add=True)
        self.assertEqual(len(enzymes), 4)
        for dropped in (SmaI, EcoRV):
            enzymes.remove(dropped)
        self.assertEqual(len(enzymes), 2)

        for absent in (EcoRV, 'EcoRV'):
            self.assertNotIn(absent, enzymes)

        # A batch can be made by adding two enzymes together...
        summed = EcoRI + KpnI
        self.assertEqual(enzymes, summed)
        # ...or by adding an enzyme to a batch.
        extended = summed + EcoRV
        summed += EcoRV
        self.assertEqual(extended, summed)
        with self.assertRaises(TypeError):
            EcoRI.__add__(1)

        # Supplier-based batches. These checks may need updating when
        # company names or product lists change.
        by_supplier = RestrictionBatch((), ('S'))  # Sigma
        self.assertEqual(by_supplier.current_suppliers(),
                         ['Sigma Chemical Corporation'])
        self.assertIn(EcoRI, by_supplier)
        self.assertNotIn(AanI, by_supplier)
        by_supplier.add_supplier('B')  # Life Technologies
        self.assertIn(AanI, by_supplier)
Ejemplo n.º 21
0
def test_dseq():
    """Exercise pydna's ``Dseq``: construction, slicing, repr, end
    manipulation (``fill_in``, ``t4``, ``mung``), ``seguid``, ``find`` and
    restriction cutting. ``obj`` is rebound many times, so the statements
    depend on their order.
    """

    import textwrap
    from pydna.dseq import Dseq

    # --- Addition of incompatible operands must raise ---
    obj1 = Dseq("a", "t", circular=True)
    obj2 = Dseq("a", "t")

    with pytest.raises(TypeError):
        obj1 + obj2

    with pytest.raises(TypeError):
        obj2 + obj1

    with pytest.raises(TypeError):
        obj1 + ""

    with pytest.raises(AttributeError):
        obj2 + ""

    obj1 = Dseq("at", "t")
    obj2 = Dseq("a", "t")

    with pytest.raises(TypeError):
        obj1 + obj2

    # --- Slicing, reverse complement and looping ---
    obj = Dseq("aaa", "ttt", circular=True)
    assert obj[1:2] == Dseq("a", "t", 0)

    assert obj[:] == Dseq("aaa", "ttt", circular=False)

    obj = Dseq("atg", "cat", 0, circular=False)

    assert obj[1:2]._data == "atg"[1:2]

    assert obj[2:1]._data == "atg"[2:1]

    assert obj.reverse_complement() == obj.rc() == Dseq("cat", "atg", 0)

    obj = Dseq("atg", "cat", circular=True)

    assert obj.looped() == obj

    assert obj[:] == Dseq("atg", "cat", 0, circular=False)

    assert obj[1:2]._data == "atg"[1:2]

    # On the circular object a slice with start > stop is expected to wrap.
    assert obj[2:1]._data == "ga"

    # --- five_prime_end on single-stranded input ---
    obj = Dseq("G", "", 0)
    assert obj.five_prime_end() == ("5'", "g")
    obj = Dseq("", "C", 0)
    assert obj.five_prime_end() == ("3'", "c")

    # --- _data, mung, repr and fill_in for a staggered duplex ---
    obj = Dseq("ccGGATCC", "aaggatcc", -2)
    assert obj._data == "ccGGATCCtt"
    assert str(obj.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-10)
    ccGGATCC
      cctaggaa
    """
    ).strip()
    assert repr(obj) == rpr

    assert obj[3] == Dseq("G", "c", 0)

    assert obj.fill_in() == Dseq("ccGGATCCtt", "aaggatccgg", 0)

    # Adding an empty Dseq on either side leaves the object unchanged.
    assert obj + Dseq("") == obj
    assert Dseq("") + obj == obj

    # --- fill_in with restricted nucleotide sets ---
    obj = Dseq("gatcAAAAAA", "gatcTTTTTT")
    assert obj.fill_in("gatc") == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")
    assert obj.fill_in("atc") == obj
    assert obj.fill_in("ac") == obj
    assert obj.fill_in("at") == obj

    obj = Dseq("AAAAAAgatc", "TTTTTTgatc")
    assert obj.fill_in("gatc") == obj
    assert obj.fill_in("atc") == obj
    assert obj.fill_in("ac") == obj
    assert obj.fill_in("at") == obj

    # --- t4 with and without restricted nucleotide sets ---
    obj = Dseq("gatcAAAAAA", "gatcTTTTTT")
    assert obj.t4() == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")

    assert obj.t4("at") == obj
    assert obj.t4("atg") == Dseq("gatcAAAAAAgat", "gatcTTTTTTgat")
    assert obj.t4("atgc") == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")
    assert obj.mung() == Dseq("AAAAAA", "TTTTTT")

    obj = Dseq("AAAAAAgatc", "TTTTTTgatc")
    assert obj.t4() == obj.t4("at") == Dseq("AAAAAA")
    assert obj.t4("atc") == obj.t4("atg") == obj.t4("atcg") == Dseq("AAAAAA")

    assert Dseq("GGATCC", "GGATCC").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("GGATCCa", "GGATCC").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("aGGATCC", "GGATCC").t4() == Dseq("aGGATCC", "GGATCCt")
    assert Dseq("aGGATCCa", "GGATCC").t4() == Dseq("aGGATCC", "GGATCCt")
    assert Dseq("GGATCC", "aGGATCC").t4() == Dseq("GGATCCt", "aGGATCC")
    assert Dseq("GGATCC", "GGATCCa").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("GGATCC", "aGGATCCa").t4() == Dseq("GGATCCt", "aGGATCC")

    assert Dseq("GGATCC", "ATCC").t4("g") == Dseq("gg", "", ovhg=0)
    assert Dseq("GGATCC", "GGATCC").t4("gat") == Dseq("ggat", "ggat", ovhg=-2)

    # --- _data, mung and repr for several overhang layouts ---
    # (each _data assertion below appears twice in a row in the original)
    a2 = Dseq("ccGGATCCaa", "ggatcc", -2)
    assert a2._data == "ccGGATCCaa"
    assert a2._data == "ccGGATCCaa"
    assert str(a2.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-10)
    ccGGATCCaa
      cctagg
    """
    ).strip()
    assert repr(a2) == rpr

    a3 = Dseq("ccGGATCC", "ggatcc", -2)
    assert a3._data == "ccGGATCC"
    assert a3._data == "ccGGATCC"
    assert str(a3.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-8)
    ccGGATCC
      cctagg
    """
    ).strip()
    assert repr(a3) == rpr

    b = Dseq("GGATCC", "aaggatcccc", 2)
    assert b._data == "ggGGATCCtt"
    assert b._data == "ggGGATCCtt"
    assert str(b.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-10)
      GGATCC
    cccctaggaa
    """
    ).strip()
    assert repr(b) == rpr

    b2 = Dseq("GGATCCaa", "ggatcccc", 2)
    assert b2._data == "ggGGATCCaa"
    assert b2._data == "ggGGATCCaa"
    assert str(b2.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-10)
      GGATCCaa
    cccctagg
    """
    ).strip()
    assert repr(b2) == rpr

    assert b2.seguid() == "hPNrcQ0sluXyfu4XuUh1trsnygc"

    b3 = Dseq("GGATCC", "ggatcccc", 2)
    assert b3._data == "ggGGATCC"
    assert b3._data == "ggGGATCC"
    assert str(b3.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-8)
      GGATCC
    cccctagg
    """
    ).strip()
    assert repr(b3) == rpr

    c = Dseq("GGATCCaaa", "ggatcc", 0)
    assert c._data == "GGATCCaaa"
    assert c._data == "GGATCCaaa"
    assert str(c.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-9)
    GGATCCaaa
    cctagg
    """
    ).strip()
    assert repr(c) == rpr

    d = Dseq("GGATCC", "aaaggatcc", 0)
    assert d._data == "GGATCCttt"
    assert d._data == "GGATCCttt"
    assert str(d.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """
    Dseq(-9)
    GGATCC
    cctaggaaa
    """
    ).strip()
    assert repr(d) == rpr

    # --- Cutting with a single enzyme and re-joining the fragments ---
    obj = Dseq("GGATCCaaa", "ggatcc", 0)
    from Bio.Restriction import BamHI

    frag1 = Dseq("G", "gatcc", 0)
    frag2 = Dseq("GATCCaaa", "g", -4)

    assert obj.cut(BamHI) == (frag1, frag2)

    assert frag1 + frag2 == obj

    # NOTE(review): the comparison below has no `assert`, so its result is
    # discarded and the seguid of `obj` is not actually checked.
    obj.seguid() == "HtK7-_BmOJw0BmtYE8f1yGdHc0c"

    assert frag1.seguid() == "yJkorWG5V2etvSLp6E6QNK-KMlQ"
    assert frag2.seguid() == "Aw3buI-N85OztBZAzeGJvXGlwO8"

    # --- repr, equality and find on a 30 bp blunt duplex ---
    obj = Dseq("tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta")
    assert (
        repr(obj)
        == "Dseq(-30)\ntagcgtagctgtagtatgtgatctggtcta\natcgcatcgacatcatacactagaccagat"
    )

    obj2 = Dseq("tagcgtagctgtagtatgtgatctggtcta")

    obj3 = obj = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta", 0
    )

    assert obj == obj2 == obj3

    assert obj.find("ggatcc") == -1

    assert obj.find("tgtagta") == 9

    # NOTE(review): this object is replaced by the next statement before it
    # is used in any assertion.
    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")

    # --- Abbreviated repr of longer sequences with overhangs ---
    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "CCCttagaccagatcacatactacagctacgcta")

    assert repr(obj) == "Dseq(-34)\ntagc..ctaa   \natcg..gattCCC"

    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaaCCC", "ttagaccagatcacatactacagctacgcta")

    assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt   "

    obj = Dseq("agcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")
    assert repr(obj) == "Dseq(-31)\n agcg..ctaa\natcgc..gatt"

    obj = Dseq("Atagcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")
    assert repr(obj) == "Dseq(-32)\nAtagc..ctaa\n atcg..gatt"

    obj = Dseq(
        "tagcgtagctgtagtatgtgatctggtctaa", "tatcgcatcgacatcatacactagaccagatt"[::-1]
    )

    assert repr(obj) == "Dseq(-32)\n tagc..ctaa\ntatcg..gatt"

    assert round(obj.mw(), 1) == 19535.6

    # --- Circular construction: circular=True and linear=False must agree ---
    obj1 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta",
        "tagaccagatcacatactacagctacgcta",
        circular=True,
        linear=False,
    )
    obj2 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta",
        "tagaccagatcacatactacagctacgcta",
        circular=True,
    )
    obj3 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta", linear=False
    )

    assert obj1 == obj2 == obj3

    assert obj1.find("ggatcc") == -1

    assert obj1.find("tgtagta") == 9

    assert (
        Dseq(
            "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta"
        ).looped()
        == obj1
    )

    # --- Cutter classification helpers ---
    from Bio.Restriction import BglII, BamHI

    obj = Dseq("ggatcc")

    assert BglII in obj.no_cutters()
    assert BamHI not in obj.no_cutters()

    assert BamHI in obj.unique_cutters()

    assert BamHI in obj.once_cutters()

    assert BamHI in (obj + obj).twice_cutters()
    assert BamHI not in obj.twice_cutters()

    assert BamHI in obj.n_cutters(1)
    assert BamHI in obj.cutters()

    # --- Cutting with a RestrictionBatch versus individual enzymes ---
    from Bio.Restriction import RestrictionBatch

    rb = RestrictionBatch((BamHI, BglII))

    # Linear sequences: the enzyme order is expected not to matter.
    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    obj = Dseq("ggatccAGATCT")

    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    obj = Dseq("AGATCTggatcc")

    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    # Circular sequences: the two enzyme orders are expected to differ, and
    # the batch result matches one of them.
    obj = Dseq("ggatccAGATCT", circular=True)

    assert obj.cut(rb) == obj.cut(BamHI, BglII) != obj.cut(BglII, BamHI)

    obj = Dseq("AGATCTggatcc", circular=True)

    assert obj.cut(rb) == obj.cut(BglII, BamHI) != obj.cut(BamHI, BglII)
Ejemplo n.º 22
0
def get_restriction_table(seq, enzyme, circular=False):
    """
    Get the restriction table for a single genomic sequence.

    Parameters
    ----------
    seq : Seq object
        A biopython Seq object representing a chromosomes or contig.
    enzyme : int, str or list of str
        The name of the restriction enzyme used, or a list of restriction
        enzyme names. Can also be an integer, to digest by fixed chunk size.
        Chunk sizes below DEFAULT_MIN_CHUNK_SIZE are raised to that value.
    circular : bool
        Whether the genome is circular.

    Returns
    -------
    numpy.array:
        List of restriction fragment boundary positions for the input sequence.

    Raises
    ------
    ValueError
        If `enzyme` is neither a valid enzyme name (or list of names) nor
        convertible to an integer chunk size.

    >>> from Bio.Seq import Seq
    >>> get_restriction_table(Seq("AAGATCGATCGG"),"DpnII")
    array([ 0,  2,  6, 12])
    >>> get_restriction_table(Seq("AA"),["DpnII", "HinfI"])
    array([0, 2])
    >>> get_restriction_table(Seq("AA"),"aeiou1")
    Traceback (most recent call last):
        ...
    ValueError: aeiou1 is not a valid restriction enzyme.
    >>> get_restriction_table("AA","DpnII")
    Traceback (most recent call last):
        ...
    TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead

    """
    chrom_len = len(seq)
    wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme)
    # Restriction batch containing the restriction enzyme
    try:
        enz = [enzyme] if isinstance(enzyme, str) else enzyme
        cutter = RestrictionBatch(enz)
    except (TypeError, ValueError):
        # Not a usable enzyme specification: fall back to a chunk size.
        try:
            cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)
        except (TypeError, ValueError):
            # int() raises TypeError for lists (e.g. a list of unknown
            # enzyme names) and ValueError for non-numeric strings; both
            # mean the enzyme argument is invalid.
            raise ValueError(wrong_enzyme)

    # Conversion from string type to restriction type
    if isinstance(cutter, int):
        sites = list(range(0, chrom_len, cutter))
        # Close the last chunk at the sequence end. The emptiness check
        # covers a zero-length sequence, where range() yields nothing.
        if not sites or sites[-1] < chrom_len:
            sites.append(chrom_len)
    else:
        # Find sites of all restriction enzymes given
        ana = Analysis(cutter, seq, linear=not circular)
        sites = ana.full()
        # Gets all sites into a single flat list with 0-based index
        sites = [site - 1 for enz in sites.values() for site in enz]
        # Sort by position, then add start and end of seq as boundaries
        sites.sort()
        sites.insert(0, 0)
        sites.append(chrom_len)

    return np.array(sites)
Ejemplo n.º 23
0
def _catalyze(record: SeqRecord,
              enzymes: List[RestrictionType],
              linear=True) -> List[Tuple[str, SeqRecord, str]]:
    """Digest a SeqRecord and collect every fragment together with its overhangs.

    A fragment is built between each pair of consecutive cut positions. On a
    linear record nothing is emitted past the last cut (or before the first);
    on a circular record the last cut is paired with the first one and the
    fragment wraps around the zero index.

    Each fragment is reported twice: once as read from the record and once
    reverse-complemented. Overhang strings carry a "^" marker: it is
    prepended when the enzyme does not report a 3' overhang and appended
    when it does.

    Args:
        record: The SeqRecord to digest with enzymes
        enzymes: List of enzymes to digest the input records with

    Keyword Args:
        linear: Whether the record to catalyze is linear or circular

    Returns:
        A list of (left overhang, cut fragment, right overhang) tuples
    """

    record = record.upper()
    sites_by_enzyme = RestrictionBatch(enzymes).search(record.seq,
                                                       linear=linear)

    # Keep one (enzyme, 0-based cut) pair per distinct position; the first
    # enzyme reporting a position wins.
    cuts_seen: Set[int] = set()
    enzyme_cuts: List[Tuple[RestrictionType, int]] = []
    for enzyme, positions in sites_by_enzyme.items():
        for position in positions:
            if position not in cuts_seen:
                cuts_seen.add(position)
                enzyme_cuts.append((enzyme, position - 1))
    enzyme_cuts.sort(key=lambda pair: pair[1])

    cut_count = len(enzyme_cuts)
    record_len = len(record)

    frag_w_overhangs: List[Tuple[str, SeqRecord, str]] = []
    for i, (enzyme, cut) in enumerate(enzyme_cuts):
        # a linear record has no fragment running from the last cut onwards
        if linear and i == cut_count - 1:
            continue

        # NOTE(review): with exactly one cut on a circular record next_cut
        # equals cut, the wrap-around branch below is not taken and the
        # fragment slice is empty -- confirm this is intended.
        next_enzyme, next_cut = enzyme_cuts[(i + 1) % cut_count]

        is_3 = enzyme.is_3overhang()
        next_is_3 = next_enzyme.is_3overhang()
        ovhg_len = len(enzyme.ovhgseq)
        next_ovhg_len = len(next_enzyme.ovhgseq)

        # the overhang lies left of the cut for 3' overhang enzymes and
        # right of it otherwise
        ovhg_start = cut - ovhg_len if is_3 else cut
        next_ovhg_start = next_cut - next_ovhg_len if next_is_3 else next_cut

        # slice bounds used for the reverse-complemented fragment
        cut_rc = cut - ovhg_len if is_3 else cut + ovhg_len
        next_cut_rc = (next_cut - next_ovhg_len
                       if next_is_3 else next_cut + next_ovhg_len)

        # find the cutsite sequences
        left_record = record[ovhg_start:ovhg_start + ovhg_len]
        right_record = record[next_ovhg_start:next_ovhg_start +
                              next_ovhg_len]

        left = str(left_record.seq)
        right = str(right_record.seq)
        left_rc = str(right_record.reverse_complement().seq)
        right_rc = str(left_record.reverse_complement().seq)

        if is_3:
            left, right_rc = left + "^", right_rc + "^"
        else:
            left, right_rc = "^" + left, "^" + right_rc

        if next_is_3:
            right, left_rc = right + "^", left_rc + "^"
        else:
            right, left_rc = "^" + right, "^" + left_rc

        if next_cut < cut:  # wraps around the zero-index
            doubled = record + record
            frag = doubled[cut:next_cut + record_len]
            frag.id = record.id
            frag_rc = doubled[cut_rc:next_cut_rc +
                              record_len].reverse_complement()
        else:
            frag = record[cut:next_cut]
            frag_rc = record[cut_rc:next_cut_rc].reverse_complement()
        frag_rc.id = record.id

        frag_w_overhangs.append((left, frag, right))
        frag_w_overhangs.append((left_rc, frag_rc, right_rc))

    return frag_w_overhangs
Ejemplo n.º 24
0
    def test_creating_batch(self):
        """Create restriction batches in several ways and modify them."""
        empty_batch = RestrictionBatch()
        self.assertEqual(empty_batch.suppl_codes()['N'], 'New England Biolabs')
        self.assertTrue(empty_batch.is_restriction(EcoRI))

        batch = RestrictionBatch([EcoRI])
        batch.add(KpnI)
        batch += EcoRV
        self.assertEqual(len(batch), 3)
        self.assertEqual(batch.elements(), ['EcoRI', 'EcoRV', 'KpnI'])
        # The order of as_string() may differ on Python 3, so only
        # membership is checked rather than the full list.
        self.assertIn('EcoRI', batch.as_string())

        # Membership by enzyme object ...
        for member in (EcoRV, EcoRI, KpnI):
            self.assertIn(member, batch)
        self.assertNotIn(SmaI, batch)
        # ... and by enzyme name
        self.assertIn('EcoRV', batch)
        self.assertNotIn('SmaI', batch)

        # get() only hands back members unless add=True is given
        batch.get(EcoRV)
        with self.assertRaises(ValueError):
            batch.get(SmaI)
        batch.get(SmaI, add=True)
        self.assertEqual(len(batch), 4)
        for removed in (SmaI, EcoRV):
            batch.remove(removed)
        self.assertEqual(len(batch), 2)

        for absent in (EcoRV, 'EcoRV'):
            self.assertNotIn(absent, batch)

        # Adding two enzymes yields a batch
        summed_batch = EcoRI + KpnI
        self.assertEqual(batch, summed_batch)
        # and so does adding an enzyme to a batch
        extended_batch = summed_batch + EcoRV
        summed_batch += EcoRV
        self.assertEqual(extended_batch, summed_batch)
        with self.assertRaises(TypeError):
            EcoRI.__add__(1)

        # Supplier-based batches. These checks may be 'update sensitive'
        # since company names and products may change often...
        supplier_batch = RestrictionBatch((), 'S')  # Sigma
        self.assertEqual(supplier_batch.current_suppliers(),
                         ['Sigma Chemical Corporation'])
        self.assertIn(EcoRI, supplier_batch)
        self.assertNotIn(AanI, supplier_batch)
        supplier_batch.add_supplier('B')  # Life Technologies
        self.assertIn(AanI, supplier_batch)
0
def find_spacers(
    itemlist: pyfaidx.Fasta,
    nuclease_info: dict,
    restriction_sites: Optional[List[str]] = None,
    chunks: int = 8,
) -> pd.DataFrame:
    """Collect candidate protospacers for a nuclease from a parsed FASTA.

    The nuclease's spacer regex is run over every sequence and over its
    reverse complement. The matches are then handed, chunk by chunk, to
    `pivot_spacers()` together with the spacer window and the optional
    restriction batch, and the per-chunk results are concatenated and
    reduced to one row per distinct spacer.

    Parameters
    ----------
    itemlist : :class:`~pyfaidx.Fasta`
        Parsed FASTA with sequences to examine for spacers
    nuclease_info : dict
        Information for the nuclease to use.  The keys read here are
        `spacer_regex`, `start` and `end`
    restriction_sites : `List[str]`, optional (default: `None`)
        Names of restriction endonucleases; when given, a batch built from
        them is passed on to `pivot_spacers()`
    chunks : `int`, optional (default: 8)
        Number of pieces to divide the spacer dataframe into before calling
        `pivot_spacers()`

    Return
    ------
    :class:`~pandas.DataFrame`
    """
    spacer_regex = regex.compile(nuclease_info["spacer_regex"])
    spacer_start: int = nuclease_info["start"]
    spacer_end: int = nuclease_info["end"]

    # Restriction enzymes whose sites must not appear in a spacer, if any
    rsb = RestrictionBatch(restriction_sites) if restriction_sites else None

    # One row per FASTA entry; search both strands of each entry
    spacers_df = fasta_to_df(itemlist)
    strand_searches = (
        ("finding forward spacers", "sequence", "forward_spacers"),
        ("finding reverse spacers", "reverse_complement", "reverse_spacers"),
    )
    for description, source_column, result_column in strand_searches:
        tqdm.pandas(desc=description, unit="sequences")
        spacers_df[result_column] = spacers_df[source_column].progress_apply(
            spacer_regex.findall
        )
    spacers_df = spacers_df.drop(columns=["sequence", "reverse_complement"])

    pivoted_pieces = [
        pivot_spacers(
            piece,
            spacer_start=spacer_start,
            spacer_end=spacer_end,
            restriction_sites=rsb,
        )
        for piece in np.array_split(spacers_df, chunks)
    ]
    spacers_df = pd.concat(pivoted_pieces)

    # the same spacer can turn up more than once; keep its first occurrence
    return spacers_df.groupby("spacer").first().reset_index()
Ejemplo n.º 26
0
def restriction_supplier(seq,
                         max_band=23130,
                         min_band=2000,
                         suppliers='ACEGFIHKJMONQPSRUVX',
                         linear=False,
                         p='yes'):
    """
    Performs restriction endonuclease analysis by batch, based on supplier.

    Every enzyme of the selected suppliers is passed to restriction_digest().
    An enzyme whose digestion raises MemoryError is reported and left out of
    the returned dictionary.

    Parameters:
        seq = DNA sequence for restriction endonuclease digestion
        max_band = size of maximum band in basepairs. Default = 23130
        min_band = size of minimum band in basepairs. Default = 2000
        suppliers = restriction enzyme supplier. Default = ACEGFIHKJMONQPSRUVX
            where
                A = Amersham Pharmacia Biotech
                C = Minotech Biotechnology
                E = Stratagene
                G = Qbiogene
                F = Fermentas AB
                I = SibEnzyme Ltd.
                H = American Allied Biochemical, Inc.
                K = Takara Shuzo Co. Ltd.
                J = Nippon Gene Co., Ltd.
                M = Roche Applied Science
                O = Toyobo Biochemicals
                N = New England Biolabs
                Q = CHIMERx
                P = Megabase Research Products
                S = Sigma Chemical Corporation
                R = Promega Corporation
                U = Bangalore Genei
                V = MRC-Holland
                X = EURx Ltd.
        linear = flag to define if DNA sequence is linear.
            Default = False (DNA is circular)
        p = flag to determine if the data is to be printed. Default = yes.
            It is forwarded to restriction_digest(); when it is not 'yes' a
            progress line is printed after every 10 enzymes instead.

    Returns:
    {Restriction endonuclease :
        (Total number of fragments after digestion,
        Number of fragments with molecular size above max_band,
        Number of fragments with molecular size between max_band and min_band,
        Number of fragments with molecular size below min_band)}
    """
    from Bio.Restriction import RestrictionBatch
    result = {}
    batch = RestrictionBatch(first=[],
                             suppliers=[x.upper() for x in suppliers])
    for count, enzyme in enumerate(batch, start=1):
        try:
            digest = restriction_digest(seq, enzyme, max_band, min_band,
                                        linear, p)
        except MemoryError:
            print('Memory Error during ' + str(enzyme) + ' digestion')
        else:
            # Only record a result when this enzyme's digestion succeeded;
            # otherwise `digest` would be unbound or hold the previous
            # enzyme's data.
            result[str(enzyme)] = (digest[0], len(digest[1]), len(digest[2]),
                                   len(digest[3]))
        if p != 'yes' and count % 10 == 0:
            print(str(count) + ' restriction endonuclease processed')
    return result
Ejemplo n.º 27
0
def findRestrictionSites(sequence, restr_batch):
    """Return the full restriction analysis of `sequence` for the given enzymes.

    `sequence` is wrapped in a Seq with the ambiguous DNA alphabet,
    `restr_batch` is turned into a RestrictionBatch, and the result of
    Analysis(...).full() is returned.
    """
    dna = Seq(sequence, IUPACAmbiguousDNA())
    return Analysis(RestrictionBatch(restr_batch), dna).full()
Ejemplo n.º 28
0
def analyze(name,
            name_bank,
            enzyme,
            genome_index,
            genome_fasta,
            genome_fastq,
            tag_length=6,
            looping=False,
            speed_looping=4,
            quality_min=30,
            tot_len_read=700,
            len_paired_wise_fastq=3,
            paired_wise_fastq=True,
            bowtie2=os.path.join(working_directory, "bowtie2"),
            bank_folder=os.path.join(toolbox_directory, "results"),
            ncpu=4):
    """Build a hic_exp object for one bank and run its processing steps.

    The function resolves the restriction site, works out the two fastq
    names, creates `bank_folder` if it does not exist, and then calls, in
    order, align(), pcr_free(), paired_reads_2_fragments(), gc_size_bias()
    and fragments_contacts_2_weighted_contacts() on the hic_exp instance,
    printing progress and timing information along the way.

    Note: this block uses Python 2 print statements and time.clock().

    Parameters (as used in this body):
        name: not read by the active code (only by commented-out lines).
        name_bank: printed and passed as first argument to hic_exp.hic_exp.
        enzyme: if it is found in AllEnzymes its `.site` is used as the
            restriction site, otherwise the value itself is used.
        genome_index, genome_fasta: forwarded to hic_exp.hic_exp.
        genome_fastq: either "read1,read2" or a single name; for a single
            name ending in '1' or '2' the mate name is derived by swapping
            that last character, otherwise the second read is "".
        tag_length, looping, speed_looping, quality_min, tot_len_read,
        len_paired_wise_fastq, paired_wise_fastq, bowtie2, ncpu: forwarded
            to hic_exp.hic_exp.
        bank_folder: directory created if missing and used to build the
            output folder.

    Returns:
        True once every step has run.
    """

    start_total = time.clock()
    hostname = socket.gethostname()
    print "Host name:", hostname
    # NOTE(review): `ordi` is assigned but not read by any active line below.
    ordi = hostname.split('.')[0]

    # if ordi == 'renoir':
    #     folder_alignment_toolbox = '/Volumes/Data/HiC_project/alignment_toolbox'
    #     bowtie2 = '/Volumes/Data/HiC_project/alignment_toolbox/bowtie2-2.0.0-beta5/'
    #     out_foldR = '/Volumes/Data/hic_data/27_08_2013/results'
    #     base_folder= '/Volumes/Data/hic_data/27_08_2013'
    #     ncpu = 24
    # else:
    # NOTE(review): `folder_alignment_toolbox` is assigned but not read by
    # any active line below.
    folder_alignment_toolbox = toolbox_directory
    #     bowtie2 = os.path.join(working_directory, "bowtie2")
    #     out_foldR = os.path.join(toolbox_directory, "results")
    base_folder = working_directory
    #     ncpu = 4
    out_foldR = bank_folder
    # Only a single directory level is created (os.mkdir, not makedirs).
    if not (os.path.exists(out_foldR)):
        os.mkdir(out_foldR)

    ########################## PARAMETERS #################################################################################
#     name = 'Vibrio_WT'
#     name_bank = 'Vibrio_209'

    # Both folders are the working directory with a trailing separator.
    folder_a = os.path.join(base_folder, '')
    folder_b = os.path.join(base_folder, '')

    # A known enzyme name is translated to its recognition site; anything
    # else is passed through unchanged.
    if enzyme in AllEnzymes:
        restriction_site = RestrictionBatch([enzyme]).get(enzyme).site  # HpaII
    else:
        restriction_site = enzyme

    #folder_alignment_toolbox +
#     genome_index = os.path.join(working_directory, "index/"+name)
#     genome_fasta = os.path.join(working_directory, "fasta/"+name+".fa")

#     tag_length = 6
#     looping = False
#     speed_looping = 4
#     quality_min = 30
    print name_bank
    print out_foldR
    # NOTE(review): out_foldR is bank_folder, so this joins bank_folder with
    # itself; presumably a per-bank sub-folder (e.g. name_bank) was meant --
    # confirm against the caller / hic_exp.
    output_folder = os.path.join(out_foldR, bank_folder)
    #     paired_wise_fastq = True
    #     len_paired_wise_fastq = 3
    #     tot_len_read = 700
    print genome_fastq
    print genome_fastq.split(',')
    # A single name: derive the mate by swapping a trailing '1'/'2'.
    if len(genome_fastq.split(',')) <= 1:
        motif_read_1 = genome_fastq
        print "Reading " + motif_read_1
        if motif_read_1[-1] == '2':
            motif_read_2 = motif_read_1[:-1] + '1'
            print "Reading " + motif_read_2
        elif motif_read_1[-1] == '1':
            motif_read_2 = motif_read_1[:-1] + '2'
            print "Reading " + motif_read_2
        else:
            motif_read_2 = ""
            print "Warning: no second fastq file found"
    else:
        # Two (or more) comma-separated names: only the first two are used.
        motif_read_1 = genome_fastq.split(',')[0]
        motif_read_2 = genome_fastq.split(',')[1]

    print motif_read_1
    print motif_read_2
    #######################################################################################################################
    # All arguments are positional; their order must match hic_exp.hic_exp's
    # signature, which is not visible here.
    hic_bank = hic_exp.hic_exp(name_bank, tot_len_read, folder_a, folder_b,
                               motif_read_1, motif_read_2, paired_wise_fastq,
                               restriction_site, ncpu, tag_length,
                               genome_index, genome_fasta, bowtie2, looping,
                               quality_min, output_folder, speed_looping,
                               len_paired_wise_fastq)
    # `start` times the three alignment steps; `start_total` times the whole call.
    start = time.clock()
    hic_bank.align()
    hic_bank.pcr_free()
    hic_bank.paired_reads_2_fragments()
    elapsed = (time.clock() - start)
    print 'Paired reads aligned in  ' + str(elapsed) + ' s'
    print " start computing biases..."
    hic_bank.gc_size_bias()
    print " done."
    print "writing abs weighted contacts"
    hic_bank.fragments_contacts_2_weighted_contacts()
    elapsed = (time.clock() - start_total)
    print "all done in " + str(elapsed) + " s"
    print "ready for computation"
    return True
Ejemplo n.º 29
0
 def __init__(self, enzymes, sequence, is_linear=True):
     """Store the inputs and search `sequence.seq` for the enzymes' sites.

     The search result of the RestrictionBatch built from `enzymes` is kept
     in `site_dict`; `is_linear` is passed to the search as its second
     positional argument.
     """
     self.enzymes, self.sequence = enzymes, sequence
     batch = RestrictionBatch(enzymes)
     self.res_batch, self.is_linear = batch, is_linear
     self.site_dict = batch.search(sequence.seq, is_linear)
Ejemplo n.º 30
0
class DigestedSequence:
    """A sequence record together with the cut sites of a set of enzymes.

    The sites are computed once, at construction, by searching
    `sequence.seq` with a RestrictionBatch built from `enzymes`.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        :param enzymes: enzymes (or enzyme names) used to build the batch
        :param sequence: record whose `.seq` is searched
        :param is_linear: passed to the batch search as the linear flag
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing position.
        :return: list of CutSites
        """
        # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        return sorted(
            CutSite(e_name, loc)
            for e_name, ctg_locs in self.site_dict.items()
            for loc in ctg_locs
        )

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Only the stretches between consecutive sites are returned; nothing is
        produced for the part before the first site or after the last one.

        :return: list of SeqRecords, or the unmodified input sequence itself
                 (not wrapped in a list) when there is no site at all.
        """
        sites = self.get_sites()
        seq = self.sequence

        if len(sites) == 0:
            return seq

        frags = []
        # Pair each site with its successor.
        # NOTE(review): the CutSite objects are used directly as slice bounds;
        # this assumes CutSite is usable as an index -- confirm.
        for a, b in zip(sites, sites[1:]):
            frg = seq[a:b]
            frg.id = "{0}:{1}:{2}".format(frg.id, a, b)
            frg.name = frg.id
            frg.description = "restriction digest fragment from {0} to {1}".format(a, b)
            frags.append(frg)

        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.

        Entries whose "excluded" flag is set are skipped. A sequence with
        fewer than `min_sites` sites triggers a message but is still included
        in the result (the skip is commented out).

        :param seq_list: list of dicts with the keys "excluded" and "record"
        :param enzyme_names: enzymes used in digestion (default: none)
        :param min_sites: number of sites below which the message is printed
        :return: list of {"name": record id, "pos": sites} dicts
        """
        if enzyme_names is None:
            enzyme_names = []

        sites = []
        for seq in seq_list:
            if seq["excluded"]:
                continue

            ds = DigestedSequence(enzyme_names, seq["record"])
            seq_sites = ds.get_sites()

            if len(seq_sites) < min_sites:
                # single-argument print() behaves the same on Python 2 and 3
                print("\tExcluded {0} (length {1}) with only {2} sites".format(
                    seq["record"].id, len(seq["record"]), len(seq_sites)
                ))
                # continue

            sites.append({"name": seq["record"].id, "pos": seq_sites})
        return sites
Ejemplo n.º 31
0
def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=[], largeIndex=False, cutoff=0,
                 offtargetcutoff=0, trim=False, logging=False, nuclease='Cas9', return_limit=9, reject=False):
    """Find spacers in a FASTA file, score them, filter off-targets with Bowtie and write a CSV.

    Steps:
      1. Read `target` and regex-search every sequence and its reverse complement.
      2. Drop candidates containing "TTTT", a site of one of `restriction_sites`,
         a GC content <= 20 or >= 80, or an on-target score below `cutoff`.
      3. Write the survivors to 'temp.fa', run `bowtie` against `refgenome`
         (output in 'offtargets.fa') and compute an off-target score per spacer.
      4. Keep spacers whose off-target score is >= `offtargetcutoff` and write
         them to `outfile`.

    Side effects: creates 'temp.fa' and 'offtargets.fa' in the current
    directory, runs the external `bowtie` program, prints progress.

    Parameters:
        target: path of the FASTA file to search.
        outfile: path of the CSV file to write.
        refgenome: Bowtie index passed to `bowtie`.
        restriction_sites: enzymes whose sites must not occur in a spacer
            (the default list is never modified here).
        largeIndex: add '--large-index' to the Bowtie call.
        cutoff: minimum on-target score; converted to float.
        offtargetcutoff: minimum off-target score; converted to float.
        trim: read the file with m_FastaIterator instead of SeqIO.parse. The header
            must then be: GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON |
            5' UTR END | 3' UTR STOP | EXON START | EXON END
        logging: not used by this function.
        nuclease: 'Cas9' or 'Cpf1'; selects the search pattern.
        return_limit: 'all' or the maximum number of spacers kept per gene.
        reject: falsy, or the maximum number of alignments per spacer; passed
            to Bowtie as '-k' and used to zero the score of spacers with more
            mismatched alignments than that.

    Returns:
        0 when no candidate spacer is found or when the user declines to retry
        a failed write; None otherwise.

    Raises:
        ValueError: if `nuclease` is not 'Cas9' or 'Cpf1'.
    """
    from itertools import groupby

    # 'U' mode was removed in Python 3.11; text mode already uses universal newlines on Python 3.
    with open(target, 'r') as infile:
        # Use our modified FastaIterator instead of, perhaps the more proper, SeqIO.parse() method:
        # it allows us to trim the UTR sequences off
        try:
            if trim:
                itemiter = m_FastaIterator(infile)
            else:
                itemiter = SeqIO.parse(infile, 'fasta')
            itemlist = list(itemiter)
        except IOError as e:
            # report, then re-raise: there is nothing to search without the input
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            raise

    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3 nucleotides for scoring
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'
    else:
        raise ValueError("Unsupported nuclease: {}".format(nuclease))

    # the cutoff may arrive as a string from the command line
    cutoff = float(cutoff)

    rsb = RestrictionBatch(restriction_sites)

    spacerlist = []
    seen = set()  # protospacers already collected
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
               progressbar.Bar(), progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()

    spacer_re = regex.compile(PAM)

    for index, item in enumerate(itemlist, start=1):
        progress.update(index)

        # Candidates on both strands
        spacerMatch = (spacer_re.findall(str(item.seq), overlapped=True) +
                       spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))

        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer.  The rest of the match is needed for scoring
            protospacer = ps[4:24]
            ps_seq = Seq(protospacer, IUPAC.unambiguous_dna)
            rs = rsb.search(ps_seq)
            # on_target_score_calculator only works if the sequence is in uppercase
            score = calc_score(ps.upper())

            # Get rid of anything with T(4+) as those act as RNAPIII terminators
            # TODO Should this also eliminate anything with G(4)?
            if "TTTT" in protospacer:
                continue
            # Get rid of anything that has the verboten restriction sites
            if any(rs.values()):
                continue
            # Eliminate potentials with a GC content <20 or >80%
            gc_content = GC(ps_seq)
            if gc_content <= 20 or gc_content >= 80:
                continue
            if float(score) < cutoff:
                continue
            # duplicated sequences would otherwise produce duplicate entries
            if protospacer in seen:
                continue

            # NOTE(review): for matches found on the reverse complement, find() returns -1 here -- confirm
            # how positions of reverse-strand spacers should be reported.
            position = int(str(item.seq).find(ps)) + int(item.description.split("|")[7])
            spacerlist.append({
                'description': item.description,
                'position': int(position),
                'score': score,
                'spacer': protospacer,
                'offtargetscore': 100,
            })
            seen.add(protospacer)
    progress.finish()

    if len(spacerlist) == 0:
        print("Sorry, no spacers matching that criteria were found")
        return 0
    else:
        print("Finished finding spacers.  {} spacers found.  Begining Bowtie alignment...".format(len(spacerlist)))

    # write out a file to pass off to Bowtie to use for off-target analysis
    with open('temp.fa', 'w') as fasta_out:
        for entry in spacerlist:
            fasta_out.write(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                 entry['score'], entry['spacer']))

    # delete lists we are not going to use in the future to save on some memory
    del itemlist
    del seen

    # Use Bowtie to find if this particular sequence has any potential off targets (i. e. two or fewer mismatches)
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs or do we even need to? (i. e. are we already?)
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())

    command = [program, '-a', "--suppress", "3,5,6,7", cpus]
    if reject:
        # subprocess needs strings; reject may be given as an int
        command += ['-k', str(reject)]
    if largeIndex:
        command.append('--large-index')
    command += [refgenome, '-f', 'temp.fa', 'offtargets.fa']
    subprocess.check_output(command)

    print("Bowtie finished.  Begining offtarget analysis...")
    bowtie_results_file = 'offtargets.fa'

    # This count is slow and probably unnecessary
    with open(bowtie_results_file) as results:
        total_lines = sum(1 for _ in results)
    print("Total alignments from Bowtie: {}".format(total_lines))
    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ', progressbar.Percentage(),
                   ' ', progressbar.Bar(), progressbar.Timer(), progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()

    prunedlist = []  # List to hold spacers whose off-target score is above the minimum cutoff

    mmpos_re = regex.compile('[0-9]{1,}')

    # Index the spacers by the read name written to temp.fa; the first spacer wins on a clash.
    spacers_by_readname = {}
    for spacer in spacerlist:
        readname = '{} {} {}'.format(spacer['description'], spacer['position'], str(spacer['score']))
        spacers_by_readname.setdefault(readname, spacer)

    keys = ['readname', 'strand', 'position', 'mmpositions']
    oftcount = 0
    with open(bowtie_results_file) as offtargetsfile:  # parse all the bowtie results into something we can use
        alignments = (dict(zip(keys, line.strip('\n').split('\t')))
                      for line in offtargetsfile if line.strip())
        # Consecutive lines with the same read name belong to one spacer. Grouping them this way scores
        # every group, including the first line of the file and the last group.
        for readname, group in groupby(alignments, key=lambda entry: entry['readname']):
            mmlist = []
            badcount = 0
            for entry in group:
                oftcount += 1
                new_progress.update(oftcount)
                pos = mmpos_re.findall(entry.get('mmpositions', ''))
                if len(pos) == 0:
                    # No mismatch positions: a perfect match (Bowtie also reports the spacer itself)
                    badcount += 1
                elif entry['strand'] == '+':
                    mmlist.append([int(w) for w in pos])
                elif entry['strand'] == '-':
                    mmlist.append([19-int(w) for w in pos])

            matching_spacer = spacers_by_readname.get(readname)
            if matching_spacer is None:
                # alignment for a read we did not write; nothing to score
                continue

            # NOTE(review): the original comment speaks of "two such matches" but the test is > 2 -- confirm
            if badcount > 2:
                matching_spacer['offtargetscore'] = 0
            # Speed this up by rejecting things with over a certain set of matching off-targets
            elif reject and len(mmlist) > int(reject):
                matching_spacer['offtargetscore'] = 0
            else:
                matching_spacer['offtargetscore'] = sumofftargets(mmlist)

            if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
                prunedlist.append(matching_spacer)

    new_progress.finish()

    print("\nFinished scoring off-targets")

    finallist = []
    if len(prunedlist) == 0:
        print("No spacers were found that correspond to the parameters you set.")
    else:
        if len(prunedlist[0]['description'].split('|')) == 9:  # need to make sure the header format is correct
            for entry in prunedlist:
                finallist.append(FormattedResult(entry))
            # Group the results per gene, keeping the order in which genes first appear
            spacers_by_gene = {}
            for formatted in finallist:
                spacers_by_gene.setdefault(formatted.GeneID, []).append(formatted)
            toplist = []
            for all_spacers_for_gene in spacers_by_gene.values():
                ranked_spacers = sorted(all_spacers_for_gene, key=attrgetter('score', 'offtargetscore'),
                                        reverse=True)
                if return_limit == 'all':
                    toplist.extend(ranked_spacers)
                else:
                    # slicing keeps every spacer when there are fewer than the limit
                    toplist.extend(ranked_spacers[:int(return_limit)])
            olist = [[entry.GeneID, entry.GeneName, entry.seq, entry.GC, entry.position,
                      entry.score, entry.offtargetscore] for entry in toplist]
            headerlist = ['GeneID', 'GeneName', 'seq', '%GC', 'position', 'score', 'off-target score']
        else:  # if it isn't, just dump all the spacers we found into a file
            finallist = spacerlist
            # the entries are the dicts built above, so they are read by key
            olist = [[entry['description'].split(' ')[0], entry['spacer'], GC(entry['spacer']),
                      entry['position'], entry['score']] for entry in finallist]
            headerlist = ['ID', 'seq', '%GC', 'position', 'score']

        def write_results():
            # Write the header row followed by all result rows
            with open(outfile, 'w') as ofile:
                output = csv.writer(ofile, dialect='excel')
                output.writerow(headerlist)
                output.writerows(olist)

        print("Writing file.")
        try:
            write_results()
        except IOError:
            print("There is trouble writing to the file.  Perhaps it is open in another application?")
            choice = input("Would you like to try again? [y/n]")
            if choice == 'y' or choice == 'Y':
                try:
                    write_results()
                except IOError:
                    print("Sorry, still was unable to write to the file")
            else:
                return 0

        print("Finished.")
Ejemplo n.º 32
0
def gen_enzyme_religation_regex(enzyme):
    """Return a regex which corresponds to all possible religation sites given a
    set of enzyme.

    For every enzyme the "give" part (everything left of the reverse cut) and
    the "accept" part (everything right of the forward cut) of its site are
    extracted. Every give/accept combination, together with its reverse
    complement, becomes one alternative of the pattern.

    Parameters
    ----------
    enzyme : str
        String that contains the names of the enzyme separated by a comma.
        Whitespace around a name and empty names are ignored.

    Returns
    -------
    re.Pattern :
        Regex that corresponds to all possible ligation sites given a set of
        enzyme.

    Examples
    --------
    >>> gen_enzyme_religation_regex('HpaII')
    re.compile('CCGCGG')
    >>> gen_enzyme_religation_regex('HpaII,MluCI')
    re.compile('AATTAATT|AATTCGG|CCGAATT|CCGCGG')
    """

    # Split the str on the comma to separate the different enzymes.
    enzyme_names = [name.strip() for name in enzyme.split(",") if name.strip()]

    # Look the enzymes up in the Biopython dictionary.
    rb = RestrictionBatch(enzyme_names)

    give_list = []
    accept_list = []

    for enz in rb:
        # Site with its cut marks: "^" forward cut, "_" reverse cut.
        site = enz.elucidate()
        fw_cut = site.find("^")
        rev_cut = site.find("_")

        # "Give" site: leading N are useless. lstrip() also copes with a
        # part made only of N, where indexing an emptied string would fail.
        give_list.append(site[:rev_cut].replace("^", "").lstrip("N"))

        # "Accept" site: trailing N are useless.
        accept_list.append(site[fw_cut + 1:].replace("_", "").rstrip("N"))

    # Build all the possible HiC ligation sites, in both orientations.
    # "N" becomes "." so that it matches any base in the regex.
    # NOTE(review): ambiguity codes other than N are left as literal letters.
    ligation_sites = set()
    for give_site in give_list:
        for accept_site in accept_list:
            ligation = give_site + accept_site
            ligation_sites.add(ligation.replace("N", "."))
            ligation_sites.add(
                str(Seq(ligation).reverse_complement()).replace("N", "."))

    # Sorted so that the pattern does not depend on iteration order.
    pattern = "|".join(sorted(ligation_sites))
    return re.compile(pattern)
Ejemplo n.º 33
0
class DigestedSequence:
    """
    Restriction-site search result for a single sequence record.

    On construction the enzymes are wrapped in a ``RestrictionBatch`` which is
    searched against ``sequence.seq``; the resulting mapping of enzyme to hit
    positions is kept in ``site_dict`` and used by the accessor methods.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        :param enzymes: enzymes handed to ``RestrictionBatch``
        :param sequence: record whose ``.seq`` attribute is searched
        :param is_linear: forwarded as the second positional argument of
            ``RestrictionBatch.search``
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing position.
        :return: sorted list of CutSites (ordering is whatever ``CutSite``
            comparison defines)
        """
        cut_sites = []
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for e_name, ctg_locs in self.site_dict.items():
            for loc in ctg_locs:
                cut_sites.append(CutSite(e_name, loc))

        return sorted(cut_sites)

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Only the stretches between consecutive sites are emitted, so a single
        site yields an empty list.

        :return: list of SeqRecords; NOTE that when no site was found the
            original record itself is returned, not a list.
        """
        sites = self.get_sites()
        seq = self.sequence

        if not sites:
            return seq

        frags = []
        # Walk over consecutive (previous, current) site pairs.
        for a, b in zip(sites, sites[1:]):
            # NOTE(review): assumes CutSite objects are usable as slice
            # bounds -- TODO confirm against the CutSite definition.
            frg = seq[a:b]
            frg.id = '{0}:{1}:{2}'.format(frg.id, a, b)
            frg.name = frg.id
            frg.description = 'restriction digest fragment from {0} to {1}'.format(
                a, b)
            frags.append(frg)

        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.
        :param seq_list: list of dicts with an 'excluded' flag and a 'record'
        :param enzyme_names: enzymes used in digestion (None means an empty list)
        :param min_sites: sequences with fewer sites than this are reported on
            stdout; they are still part of the result.
        :return: list of {'name': record id, 'pos': sites} dicts
        """
        if enzyme_names is None:
            enzyme_names = []

        sites = []
        for seq in seq_list:
            if seq['excluded']:
                continue

            ds = DigestedSequence(enzyme_names, seq['record'])
            seq_sites = ds.get_sites()

            if len(seq_sites) < min_sites:
                # Report only: the sequence is deliberately not skipped here,
                # so it is still appended below.
                print('\tExcluded {0} (length {1}) with only {2} sites'.format(
                    seq['record'].id, len(seq['record']), len(seq_sites)))

            sites.append({'name': seq['record'].id, 'pos': seq_sites})
        return sites
Ejemplo n.º 34
0
 def __init__(self, enzymes, sequence, is_linear=True):
     """Search ``sequence.seq`` for the given enzymes and keep the result.

     :param enzymes: enzymes handed to ``RestrictionBatch``
     :param sequence: record whose ``.seq`` attribute is searched
     :param is_linear: forwarded as the second positional argument of
         ``RestrictionBatch.search``
     """
     self.enzymes, self.sequence = enzymes, sequence
     batch = RestrictionBatch(enzymes)
     self.res_batch = batch
     self.is_linear = is_linear
     # Mapping returned by the batch search, stored for later lookups.
     self.site_dict = batch.search(sequence.seq, is_linear)
Ejemplo n.º 35
0
def write_frag_info(
    fasta,
    enzyme,
    size=DEFAULT_THRESHOLD_SIZE,
    circular=False,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME,
    output_dir=None,
):
    """Write the fragments_list.txt and info_contigs.txt that are necessary
    for GRAAL to run.

    Parameters
    ----------
    fasta :
        FASTA input handed to ``SeqIO.parse``.
    enzyme : str or int
        Restriction enzyme name. When the ``RestrictionBatch`` lookup raises
        ``ValueError`` the value is instead read as an integer chunk size
        (at least ``DEFAULT_MIN_CHUNK_SIZE``) and contigs are cut into
        fixed-size pieces.
    size : int
        Minimum contig length; shorter contigs are skipped entirely.
    circular : bool
        Digest contigs as circular molecules (enzyme mode only).
    output_contigs, output_frags : str
        File names of the contig summary and the fragment list.
    output_dir : str or None
        Directory prepended to both file names; ``None`` uses them as given.

    Raises
    ------
    ValueError
        If ``enzyme`` is neither a known enzyme nor convertible to ``int``.
    """

    chunk_size = None
    try:
        my_enzyme = RestrictionBatch([enzyme]).get(enzyme)
    except ValueError:
        # Not an enzyme name: fall back to fixed-size chunking.
        my_enzyme = None
        chunk_size = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)

    # Convert the threshold once. The original reused the name ``size`` for
    # each fragment length, which silently replaced the threshold after the
    # first contig had been processed.
    min_contig_length = int(size)

    records = SeqIO.parse(fasta, "fasta")

    # Explicit None test: os.path.join(None, ...) raises TypeError on
    # Python 3, so catching AttributeError does not cover the default.
    if output_dir is None:
        info_contigs_path = output_contigs
        frag_list_path = output_frags
    else:
        info_contigs_path = os.path.join(output_dir, output_contigs)
        frag_list_path = os.path.join(output_dir, output_frags)

    with open(info_contigs_path, "w") as info_contigs, \
            open(frag_list_path, "w") as fragments_list:

        # NOTE(review): the column is labelled length_kb but the value
        # written below is len(record.seq) -- confirm the intended unit.
        info_contigs.write("contig\tlength_kb\tn_frags\tcumul_length\n")
        fragments_list.write("id\tchrom\tstart_pos"
                             "\tend_pos\tsize\tgc_content\n")

        # Number of fragments written for all previous contigs.
        total_frags = 0

        for record in records:
            my_seq = record.seq
            contig_name = record.id
            contig_length = len(my_seq)
            if contig_length < min_contig_length:
                continue

            if chunk_size is None:
                my_frags = my_enzyme.catalyze(my_seq, linear=not circular)
            else:
                # Slicing clamps at the end of the sequence, so the last
                # chunk is simply shorter.
                my_frags = (my_seq[i:i + chunk_size]
                            for i in range(0, contig_length, chunk_size))

            n_frags = 0
            current_id = 1
            start_pos = 0
            for frag in my_frags:
                frag_size = len(frag)
                if frag_size == 0:
                    continue

                end_pos = start_pos + frag_size
                gc_content = SeqUtils.GC(frag) / 100.0

                fragments_list.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
                    current_id,
                    contig_name,
                    start_pos,
                    end_pos,
                    frag_size,
                    gc_content,
                ))

                # Internal invariant: only the first fragment starts at 0.
                assert (current_id == 1
                        and start_pos == 0) or (current_id > 1
                                                and start_pos > 0), (
                    current_id, start_pos)

                start_pos = end_pos
                current_id += 1
                n_frags += 1

            info_contigs.write("%s\t%s\t%s\t%s\n" % (
                contig_name,
                contig_length,
                n_frags,
                total_frags,
            ))
            total_frags += n_frags
#
# Created:     11/04/2013
#-------------------------------------------------------------------------------

from Bio import Entrez
from Bio import SeqIO
from Bio.Restriction import Restriction
from Bio.Restriction import RestrictionBatch

# Download one nucleotide record from NCBI. Entrez requires a contact e-mail.
email="*****@*****.**"
Entrez.email = email
fetch_seq = Entrez.efetch(db="nucleotide", rettype="fasta",retmode="text", id="294489415")
try:
    seq_record = SeqIO.read(fetch_seq, "fasta")
finally:
    # Release the network handle even when parsing fails.
    fetch_seq.close()

# To see how a specific restriction enzyme (Sau3AI) would digest your sequence:
# (single-argument print calls behave the same on Python 2 and Python 3)
print("Restriction site is {}".format(Restriction.Sau3AI.site))
digest = Restriction.Sau3AI.catalyse(seq_record.seq)
print("Number of fragments is {}".format(len(digest)))
print("------\nLengths of each fragment\n------")
for fragment in digest:
    print(len(fragment))

# Run every restriction enzyme in the New England Biolabs database against the sequence
rb_supp = RestrictionBatch(first=[], suppliers=['N'])
# Search once and reuse the result instead of re-running it for each enzyme.
search_results = rb_supp.search(seq_record.seq)
for rest, cut_positions in search_results.items():
    # To list only enzymes with between 10 and 40 cut positions, test
    # ``10 < len(cut_positions) < 40`` here instead.
    # This shows every restriction enzyme that was able to digest the sequence in some way.
    if len(cut_positions) > 0:
        print("{} : {}".format(rest, cut_positions))
Ejemplo n.º 37
0
def _draw_candidate_dna(amino_acids, sequence, aa_count_dict):
    """Build one DNA string for ``sequence`` from the codon pool and check its length."""
    codons = Calculator.compute_and_Switch(amino_acids, sequence, aa_count_dict)
    dna = "".join(codons).replace("U", "T")
    if len(dna) != len(sequence) * 3:
        raise Exception("final sequance length does not match input sequence length")
    return dna


def _record_translating_to(dna_seq, name, sequence):
    """Wrap ``dna_seq`` in a SeqRecord, raising unless it translates back to ``sequence``."""
    record = SeqRecord.SeqRecord(dna_seq, name=name)
    if record.translate().seq != sequence:
        raise Exception("error- resulting DNA does not translate back to protein")
    return record


def main(protein_fasta_open_file, list_codon_usage_open_files, output_destination, restriction_enzymes=""):
    """Back-translate a protein into DNA and write it as FASTA.

    :param protein_fasta_open_file: input handed to ``Parser.parse_fasta_file``
    :param list_codon_usage_open_files: iterable of ``(file name, open file)``
        pairs, one codon usage table per organism; the organism name is the
        file name up to its first dot
    :param output_destination: target handed to ``SeqIO.write``
    :param restriction_enzymes: enzyme names separated by commas and/or
        whitespace; an empty string disables the restriction check
    :return: a status message
    :raises Exception: if no codon table is given, or if a generated sequence
        has the wrong length or does not translate back to the protein. The
        retry loop now raises as well; it used to print and call ``exit(1)``.

    With enzymes given, up to 100 further candidates are generated until one
    is cut by none of them; otherwise the candidate cut by the fewest enzymes
    is written. The first candidate takes part in that comparison.
    """
    # parse protein
    record = Parser.parse_fasta_file(protein_fasta_open_file)
    name, sequence = record.name, record.seq

    # parse tables, assuming the files are already open
    if len(list_codon_usage_open_files) == 0:
        raise Exception("Error: Empty codon table filnames")
    creatures = {}
    for fname, open_file in list_codon_usage_open_files:
        creature_name = fname.split('.')[0]
        codon_usage_dict, codon_to_protein_dict, AA_list = Parser.parse_kazusa_codon_usage_table(open_file)
        creatures[creature_name] = codon_usage_dict, codon_to_protein_dict, AA_list

    # The amino-acid objects are built from the last table that was read.
    _, codon_to_protein_dict, aa_names = creatures[creature_name]
    amino_acids = [AminoAcid.AminoAcid(aa, codon_to_protein_dict) for aa in aa_names]
    for creature_name, creature_tuple in creatures.items():
        codon_usage_dict = creature_tuple[0]
        for amino_acid in amino_acids:
            amino_acid.add_organism_codons(codon_usage_dict, creature_name)

    # str() instead of the private Seq._data attribute
    aa_count_dict = ProtParam.ProteinAnalysis(str(sequence)).count_amino_acids()

    # replaces aa with codons from codon pool
    final_sequence = _draw_candidate_dna(amino_acids, sequence, aa_count_dict)
    record = _record_translating_to(Seq(final_sequence), name, sequence)

    if restriction_enzymes == "":
        SeqIO.write(record, output_destination, "fasta")
        return "ouput sucsessful"

    # restriction enzymes- verifies they do not cut the sequence. if they do, pick the least cut sequence
    # str.split() already splits on newlines and tabs, so only commas need replacing.
    batch = RestrictionBatch(restriction_enzymes.replace(",", " ").split())

    # Start from the first candidate so it is not discarded when every retry
    # turns out worse.
    best_sequ = final_sequence
    best_num_cutting = len(check_restriction(Seq(best_sequ, generic_dna), batch))

    max_attempts = 100
    for _ in range(max_attempts):
        if best_num_cutting == 0:
            break
        candidate = _draw_candidate_dna(amino_acids, sequence, aa_count_dict)
        _record_translating_to(Seq(candidate, generic_dna), name, sequence)
        num_cutting = len(check_restriction(Seq(candidate, generic_dna), batch))
        # "<=" keeps the most recent candidate on ties.
        if num_cutting <= best_num_cutting:
            best_sequ, best_num_cutting = candidate, num_cutting

    cutting = check_restriction(Seq(best_sequ, generic_dna), batch, to_print=True)
    record = SeqRecord.SeqRecord(Seq(best_sequ, generic_dna), name=name)

    if best_num_cutting == 0:
        # achieved non cutting sequence
        print("printing to output file....")
        SeqIO.write(record, output_destination, "fasta")
        print("ouput sucsessful")
        return "Output Sucsessful"

    # best sequence, as in one that is cut by the least amount of restriction enzymes
    SeqIO.write(record, output_destination, "fasta")
    return "The enzymes the cut the sequence are:" + str(cutting) + "\n Output printed to specified location."
Ejemplo n.º 38
0
    def test_creating_batch(self):
        """Creating and modifying a restriction batch."""
        batch = RestrictionBatch([EcoRI])
        batch.add(KpnI)
        batch += EcoRV
        self.assertEqual(len(batch), 3)

        # The usual way to test batch membership
        self.assertIn(EcoRV, batch)
        self.assertIn(EcoRI, batch)
        self.assertIn(KpnI, batch)
        self.assertNotIn(SmaI, batch)
        # Syntax sugar for the above
        self.assertIn('EcoRV', batch)
        self.assertNotIn('SmaI', batch)

        batch.get(EcoRV)
        self.assertRaises(ValueError, batch.get, SmaI)

        batch.remove(EcoRV)
        self.assertEqual(len(batch), 2)

        self.assertNotIn(EcoRV, batch)
        self.assertNotIn('EcoRV', batch)

        # Create a batch with suppliers and other supplier related methods
        # These tests may be 'update sensitive' since company names and
        # products may change often...
        batch = RestrictionBatch((), ('S'))  # Sigma
        self.assertEqual(batch.current_suppliers(),
                         ['Sigma Chemical Corporation'])
        self.assertIn(EcoRI, batch)
        self.assertNotIn(AanI, batch)
        batch.add_supplier('B')  # Life Technologies
        self.assertIn(AanI, batch)