def digest_genome(genome_fp, restriction_enzyme, output_dir, linear=False):
    """Digest a genome in silico and append its fragments to a BED file.

    Every record of the genome file is searched for cut sites of
    ``restriction_enzyme``. The fragments are appended, one record at a
    time, to ``<output_dir>/<genome stem>.<restriction_enzyme>.fragments.bed``
    as tab-separated ``chr``, ``start``, ``end``, ``frag_id`` rows without a
    header line.

    Args:
        genome_fp: Path to the genome file. It is parsed as FASTA when one
            of the dot-separated suffixes of its file name is ``fa``,
            ``fas``, ``fasta`` or ``fna`` (case-insensitive), otherwise as
            GenBank.
        restriction_enzyme: Enzyme name accepted by ``RestrictionBatch``.
        output_dir: Directory that receives the BED file.
        linear: Forwarded to ``RestrictionBatch.search``; the default
            ``False`` searches every record as a circular molecule.

    Returns:
        None. If the BED file already exists the user is prompted; any
        answer other than ``y`` leaves the file untouched and returns early.
    """
    base_fp = os.path.basename(genome_fp)
    stem = base_fp[:base_fp.rfind('.')] if '.' in base_fp else base_fp
    base_fp = os.path.join(
        output_dir, '{}.{}.fragments.bed'.format(stem, restriction_enzyme))

    if os.path.isfile(base_fp):
        overwrite = input(
            'WARNING: Overwriting existing fragment BED {}. Continue? [y/N]'.
            format(base_fp))
        if overwrite.lower() != 'y':
            print("Did not overwrite existing fragment BED.")
            return
        # Rows are appended below, so a stale file has to go first.
        os.remove(base_fp)

    print("Digesting")

    # Decide the format from the file-name suffixes only. A plain substring
    # test on the whole path would treat e.g. "family.gbk" as FASTA.
    fasta_suffixes = ('fa', 'fas', 'fasta', 'fna')
    suffixes = os.path.basename(genome_fp).lower().split('.')[1:]
    genome_format = 'genbank'
    if any(suffix in fasta_suffixes for suffix in suffixes):
        genome_format = 'fasta'

    # The batch does not depend on the record, so build it once.
    batch = RestrictionBatch([restriction_enzyme])

    # Text mode already gives universal newlines on Python 3; the old "rU"
    # mode was removed in Python 3.11. The context manager closes the handle.
    with open(genome_fp, "r") as handle:
        for chromosome in SeqIO.parse(handle, format=genome_format):
            seq_len = len(chromosome.seq)
            print('{}\t{}'.format(chromosome.id, seq_len))

            # Drop the version suffix ("NC_000913.3" -> "NC_000913"); only
            # the last dot is split so ids with several dots do not crash.
            accession = chromosome.id
            if "." in accession:
                accession = accession.rsplit(".", 1)[0]
            if not accession.startswith("chr"):
                accession = "chr" + accession

            hits = batch.search(chromosome.seq, linear=linear)
            for cutpoints in hits.values():
                if not cutpoints:
                    print('No restriction sites found for {}'.format(
                        chromosome.id))
                    continue

                # A circular search may report wrapped sites out of order;
                # the fragment arithmetic below needs ascending positions.
                df = pd.DataFrame(sorted(cutpoints), columns=['cutpoint'])
                df['end'] = df.cutpoint - 1
                # Each fragment starts where the previous one ended.
                df['start'] = df.end - (df.cutpoint.diff())
                df.loc[0, 'start'] = 0
                df['start'] = df['start'].astype('Int64')

                if len(df) > 1:
                    # Add the fragment from the last cut to the record end.
                    last_fragment = pd.DataFrame({
                        'start': [df.loc[len(df) - 1, 'end']],
                        'end': [seq_len],
                        'cutpoint': [-1]
                    })
                    # DataFrame.append was removed in pandas 2.0.
                    df = pd.concat([df, last_fragment], ignore_index=True)
                else:
                    # A single cut is reported as one fragment spanning the
                    # whole record (kept from the original behaviour).
                    df.loc[len(df) - 1, 'end'] = seq_len

                df['frag_id'] = df.index
                df['chr'] = accession
                df[['chr', 'start', 'end', 'frag_id']].to_csv(
                    base_fp, index=False, sep='\t', mode='a', header=None)
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    """List enzymes that cut the MCS but leave the gene of interest intact.

    Each sequence is taken from the literal argument when that is non-empty,
    otherwise it is loaded with ``read_seq`` from the matching file argument.

    Returns:
        A list of ``(enzyme name, cut position in the MCS, end description,
        supplier codes)`` tuples ordered by cut position. The end
        description is ``"blunt"`` for blunt cutters and the enzyme's
        ``elucidate()`` string otherwise.

    Raises:
        Exception: If either sequence ends up empty.
    """
    batch = RestrictionBatch(suppliers=[
        'C', 'B', 'E', 'I', 'K', 'J', 'M', 'O', 'N', 'Q', 'S', 'R', 'V', 'Y',
        'X'
    ])

    if goi:
        goi = Seq(goi, IUPACUnambiguousDNA())
    else:
        goi = read_seq(goiFile)
    if not goi:
        raise Exception('Please provide a GOI sequence!')

    if mcs:
        mcs = Seq(mcs, IUPACUnambiguousDNA())
    else:
        mcs = read_seq(mcsFile)
    if not mcs:
        raise Exception('Please provide a MCS sequence!')

    mcs_hits = batch.search(mcs)
    goi_hits = batch.search(goi)

    # Keep enzymes with at least one site in the MCS and none in the GOI.
    mcs_cutters = {enz for enz in mcs_hits if mcs_hits[enz]}
    goi_cutters = {enz for enz in goi_hits if goi_hits[enz]}
    usable = mcs_cutters - goi_cutters

    rows = [
        (str(enz), position,
         "blunt" if enz.is_blunt() else enz.elucidate(),
         ' '.join(enz.suppl))
        for enz in usable
        for position in mcs_hits[enz]
    ]
    rows.sort(key=lambda row: row[1])
    return rows
def has_restriction_site(seq):
    """Return True when any enzyme in ``restriction_sites`` cuts ``seq``.

    ``restriction_sites`` is read from the enclosing scope and handed to
    ``RestrictionBatch`` unchanged.
    """
    from Bio.Seq import Seq
    from Bio.Restriction import RestrictionBatch

    batch = RestrictionBatch(restriction_sites)
    cuts_by_enzyme = batch.search(Seq(seq))
    for cuts in cuts_by_enzyme.values():
        if cuts:
            return True
    return False
def test_batch_analysis(self):
    """Sequence analysis with a restriction batch."""
    spacer = "AAAA"
    target = Seq(spacer + EcoRV.site + spacer + EcoRI.site + spacer)
    found = RestrictionBatch([EcoRV, EcoRI]).search(target)
    # One site per enzyme, each flanked by the four-base spacer.
    self.assertEqual(found[EcoRV], [8])
    self.assertEqual(found[EcoRI], [16])
def test_batch_analysis(self):
    """Sequence analysis with a restriction batch."""
    spacer = "AAAA"
    letters = spacer + EcoRV.site + spacer + EcoRI.site + spacer
    target = Seq(letters, IUPACAmbiguousDNA())
    found = RestrictionBatch([EcoRV, EcoRI]).search(target)
    # One site per enzyme, each flanked by the four-base spacer.
    self.assertEqual(found[EcoRV], [8])
    self.assertEqual(found[EcoRI], [16])
def apply_restricts(dna, restricts, circular=False):
    '''Applies restriction site cleavage to forward and reverse strands.

    The enzymes are applied one after another: each one is resolved by name
    through a fresh ``RestrictionBatch`` and then run, via
    ``_apply_restrict_to_dnas``, over every fragment produced so far.
    '''
    fragments = [dna]
    for enzyme_name in map(str, restricts):
        batch = RestrictionBatch()
        batch.add(enzyme_name)
        fragments = _apply_restrict_to_dnas(
            fragments, batch.get(enzyme_name), circular)
    return fragments
def OnDistribution(self, event):
    """Plot a histogram of fragment lengths for the selected enzyme.

    Reads the FASTA path from ``self.genome_name`` and the enzyme name from
    ``self.site_enzyme``, joins all records of the file into one sequence,
    digests it with ``enzyme.catalyse`` and shows the fragment lengths as a
    1000-bin histogram. Nothing happens when the path or the selection is
    empty, or when the file contains no records.

    Args:
        event: The triggering event; not used.
    """
    fasta_file = self.genome_name.GetValue()
    enzyme_name = self.site_enzyme.GetStringSelection()
    # Check the inputs before resolving the enzyme, so an empty selection
    # is never handed to RestrictionBatch.
    if not fasta_file or not enzyme_name:
        return
    enzyme = RestrictionBatch([enzyme_name]).get(enzyme_name)

    genome = None
    # Text mode already gives universal newlines on Python 3; the old "rU"
    # mode was removed in Python 3.11.
    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Records are joined end to end, as before.
            genome = record.seq if genome is None else genome + record.seq
    if genome is None:
        # No records: nothing to digest or plot.
        return

    plt.hist([len(i) for i in enzyme.catalyse(genome)], alpha=.3, bins=1000)
    plt.show()
def test_analysis_restrictions(self):
    """Test Fancier restriction analysis."""
    circular_dna = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA', IUPACAmbiguousDNA())
    enzymes = RestrictionBatch([EcoRI, KpnI, EcoRV])
    analysis = Analysis(enzymes, circular_dna, linear=False)

    # Blunt cutters only.
    self.assertEqual(analysis.blunt(), {EcoRV: []})
    self.assertEqual(analysis.full(), {KpnI: [], EcoRV: [], EcoRI: [33]})
    # Enzymes that have a site.
    self.assertEqual(analysis.with_sites(), {EcoRI: [33]})
    # Enzymes that have no site.
    self.assertEqual(analysis.without_site(), {KpnI: [], EcoRV: []})
    self.assertEqual(analysis.with_site_size([32]), {})
    # Enzymes producing 5' overhangs.
    self.assertEqual(analysis.overhang5(), {EcoRI: [33]})
    # Enzymes producing 3' overhangs.
    self.assertEqual(analysis.overhang3(), {KpnI: []})
    # Enzymes producing defined ends.
    self.assertEqual(analysis.defined(), {KpnI: [], EcoRV: [], EcoRI: [33]})
    # Enzymes which cut N times.
    self.assertEqual(analysis.with_N_sites(2), {})

    # Enzymes which cut between two positions; bounds must be integers.
    with self.assertRaises(TypeError):
        analysis.only_between('t', 20)
    with self.assertRaises(TypeError):
        analysis.only_between(1, 't')
    self.assertEqual(analysis.only_between(1, 20), {})
    self.assertEqual(analysis.only_between(20, 34), {EcoRI: [33]})
    # Start and end given in reverse order.
    self.assertEqual(analysis.only_between(34, 20), {EcoRI: [33]})
    self.assertEqual(analysis.only_outside(20, 34), {})

    with self.assertWarns(BiopythonWarning):
        analysis.with_name(['fake'])
    self.assertEqual(analysis.with_name([EcoRI]), {EcoRI: [33]})

    self.assertEqual((analysis._boundaries(1, 20)[:2]), (1, 20))
    # Reverse order.
    self.assertEqual((analysis._boundaries(20, 1)[:2]), (1, 20))
    # Negative start.
    self.assertEqual((analysis._boundaries(-1, 20)[:2]), (20, 33))
    # Negative end.
    self.assertEqual((analysis._boundaries(1, -1)[:2]), (1, 33))

    # Sites inside and outside of the boundaries.
    two_site_dna = Seq('GAATTCAAAAAAGAATTC', IUPACAmbiguousDNA())
    analysis = Analysis(RestrictionBatch([EcoRI]), two_site_dna)
    # Cut at least inside.
    self.assertEqual(analysis.between(1, 7), {EcoRI: [2, 14]})
    # Cut at least inside, reporting only the inside site.
    self.assertEqual(analysis.show_only_between(1, 7), {EcoRI: [2]})
    # Cut at least outside.
    self.assertEqual(analysis.outside(1, 7), {EcoRI: [2, 14]})
    # Do not cut within.
    self.assertEqual(analysis.do_not_cut(7, 12), {EcoRI: [2, 14]})
def re_sites(self, sequence):
    """Locate the restriction sites of ``self.enzyme_set`` in ``sequence``.

    Args:
        sequence: DNA sequence string to search.

    Returns:
        A list of ``(position, enzyme)`` tuples sorted by position, where
        ``position`` is the reported cut position shifted by
        ``enzyme.fst3 - 1``. Positions are unique: if two enzymes map to
        the same position, only the one seen last is kept.
    """
    # The alphabet must be an instance, not the class itself.
    seq = Seq(sequence, IUPACAmbiguousDNA())
    # Set up the batch with our enzymes
    rb = RestrictionBatch(self.enzyme_set)
    # Do digest and reformat to dict of {site: enz, site: enz}
    re_sites = {}
    for enzyme, cutsites in rb.search(seq).items():
        for cut in cutsites:
            re_sites[cut + enzyme.fst3 - 1] = enzyme
    return sorted(re_sites.items())
def test_creating_batch(self):
    """Creating and modifying a restriction batch."""
    enzymes = RestrictionBatch([EcoRI])
    enzymes.add(KpnI)
    enzymes += EcoRV
    self.assertEqual(len(enzymes), 3)

    # Membership by enzyme object.
    for member in (EcoRV, EcoRI, KpnI):
        self.assertIn(member, enzymes)
    self.assertNotIn(SmaI, enzymes)

    # Membership by enzyme name.
    self.assertIn('EcoRV', enzymes)
    self.assertNotIn('SmaI', enzymes)

    # get() succeeds for members and raises for anything else.
    enzymes.get(EcoRV)
    self.assertRaises(ValueError, enzymes.get, SmaI)

    enzymes.remove(EcoRV)
    self.assertEqual(len(enzymes), 2)

    for absent in (EcoRV, 'EcoRV'):
        self.assertNotIn(absent, enzymes)
def test_analysis_restrictions(self):
    """Test Fancier restriction analysis."""
    circular_dna = Seq('TTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA', IUPACAmbiguousDNA())
    enzymes = RestrictionBatch([EcoRI, KpnI, EcoRV])
    analysis = Analysis(enzymes, circular_dna, linear=False)

    # Blunt cutters only.
    self.assertEqual(analysis.blunt(), {EcoRV: []})
    self.assertEqual(analysis.full(), {KpnI: [], EcoRV: [], EcoRI: [33]})
    # Enzymes that have a site.
    self.assertEqual(analysis.with_sites(), {EcoRI: [33]})
    # Enzymes that have no site.
    self.assertEqual(analysis.without_site(), {KpnI: [], EcoRV: []})
    self.assertEqual(analysis.with_site_size([32]), {})

    # Enzymes which cut between two positions.
    self.assertEqual(analysis.only_between(1, 20), {})
    self.assertEqual(analysis.only_between(20, 34), {EcoRI: [33]})
    # Start and end given in reverse order.
    self.assertEqual(analysis.only_between(34, 20), {EcoRI: [33]})
    self.assertEqual(analysis.only_outside(20, 34), {})

    with self.assertWarns(BiopythonWarning):
        analysis.with_name(['fake'])
    self.assertEqual(analysis.with_name([EcoRI]), {EcoRI: [33]})

    self.assertEqual((analysis._boundaries(1, 20)[:2]), (1, 20))
    # Reverse order.
    self.assertEqual((analysis._boundaries(20, 1)[:2]), (1, 20))
    # Negative start.
    self.assertEqual((analysis._boundaries(-1, 20)[:2]), (20, 33))
def __init__(be, bp_enzyme, price=None, units=None):
    """Initialise the instance from a Biopython restriction enzyme.

    ``price`` and ``units`` go to the ``enzyme`` base initialiser. The
    recognition site, its length, the enzyme name and two cut offsets taken
    from ``bp_enzyme.charac`` are stored on the instance. The second offset
    is shifted by the site length.
    """
    enzyme.__init__(be, price, units)

    recognition_site = bp_enzyme.site
    be.restriction_site = recognition_site
    be.site_len = len(recognition_site)

    # The batch renders its members as strings; take the only entry.
    batch_names = RestrictionBatch([bp_enzyme]).as_string()
    be.name = batch_names[0]

    charac = bp_enzyme.charac
    be.cut0 = charac[0]
    be.cut1 = charac[1] + be.site_len

    be.bp = True
    be.bp_enzyme = bp_enzyme
def calc_digest_products(seq, enzymes, *, is_circular):
    """Return the fragments produced by digesting ``seq`` with ``enzymes``.

    Args:
        seq: Template sequence as a string.
        enzymes: Enzyme names. A trailing ``-HF`` or ``-HFv2`` is stripped
            before the name is looked up.
        is_circular: Whether the template is circular. For a circular
            template the search also finds sites spanning the origin, and
            the fragment crossing the origin is returned last.

    Returns:
        List of fragment sequences (slices of ``seq``) in template order.

    Raises:
        UsageError: If ``enzymes`` is empty.
        ConfigError: If an enzyme name is unknown, or if none of the
            enzymes cuts the template.
    """
    from more_itertools import pairwise, flatten
    from Bio.Restriction import RestrictionBatch
    from Bio.Seq import Seq

    if not enzymes:
        raise UsageError("no enzymes specified", enzymes=enzymes)

    enzymes = [re.sub('-HF(v2)?$', '', x) for x in enzymes]

    try:
        batch = RestrictionBatch(enzymes)
    except ValueError:
        raise ConfigError(
            lambda e: f"unknown enzyme(s): {','.join(map(repr, e.enzymes))}",
            enzymes=enzymes,
        ) from None

    # Search with the real topology: the default linear search would miss
    # sites that span the origin of a circular template.
    hits = batch.search(Seq(seq), linear=not is_circular)

    # Convert to 0-based offsets. A set merges enzymes that cut at the same
    # position, which would otherwise yield empty fragments.
    sites = {x - 1 for x in flatten(hits.values())}

    if not sites:
        raise ConfigError(
            lambda e: f"{','.join(map(repr, e.enzymes))} {plural(enzymes):/does/do} not cut template.",
            enzymes=enzymes,
            seq=seq,
        )

    if not is_circular:
        # A linear template is also bounded by its two ends.
        sites |= {0, len(seq)}

    sites = sorted(sites)
    seqs = [seq[i:j] for i, j in pairwise(sites)]

    if is_circular:
        # The fragment running from the last cut, across the origin, to the
        # first cut.
        seqs.append(seq[sites[-1]:] + seq[:sites[0]])

    return seqs
def fetch_restriction_sites(self, enzymes="Common"):
    """Find enzymes that cut the reference and the variant differently.

    ``enzymes`` selects the candidates: ``"ALL"``, ``"Common"`` or ``"HF"``
    pick a predefined batch, anything else is split on commas into enzyme
    names. Ambiguous cutters are dropped. ``self.ref_sites`` and
    ``self.primary_variant_sites`` receive the search results for the two
    sequences. ``self.rflps`` keeps, per enzyme, the
    ``(reference sites, variant sites)`` pair when the reference has one to
    three sites and the two site lists differ.
    """
    if enzymes == "ALL":
        candidates = AllEnzymes
    elif enzymes == "Common":
        candidates = CommOnly
    elif enzymes == "HF":
        candidates = high_fidelity
    else:
        candidates = RestrictionBatch(enzymes.split(","))

    # Filter ambiguous cutters
    unambiguous = []
    for enz in candidates:
        if enz.is_ambiguous() is False:
            unambiguous.append(enz)
    candidates = RestrictionBatch(unambiguous)

    self.ref_sites = dict(candidates.search(self.ref_seq))
    self.primary_variant_sites = dict(
        candidates.search(self.primary_variant_seq))

    # Keep enzymes with 1-3 reference sites whose pattern changes.
    rflps = {}
    for enz, ref_cuts in list(self.ref_sites.items()):
        if not 0 < len(ref_cuts) <= 3:
            continue
        variant_cuts = self.primary_variant_sites[enz]
        if ref_cuts != variant_cuts:
            rflps[enz] = (ref_cuts, variant_cuts)
    self.rflps = rflps
def restriction_sites_present(spacer: str, rsb: RestrictionBatch) -> bool:
    """Determine if any of a set of restriction sites is present in a sequence\f

    Parameters
    ----------
    spacer : `str`
        Spacer sequence to examine for restriction sites.
    rsb : :class:`~Bio.Restriction.RestrictionBatch`
        Batch of restriction enzymes to search with.

    Returns
    -------
    `bool`
        ``True`` if at least one enzyme of the batch has a site in
        `spacer`, ``False`` otherwise. The positions are not returned.
    """
    return any(hits for hits in rsb.search(Seq(spacer)).values())
def get_restriction_site(enzyme):
    """Function to return a regex which corresponds to all possible restriction
    sites given a set of enzyme.

    Every IUPAC ambiguity code in a recognition site is translated to the
    matching regex construct (``N`` becomes ``.``, ``R`` becomes ``[AG]``,
    and so on), so the pattern matches the same sequences as the enzyme.

    Parameters:
    -----------
    enzyme : str
        String that contains the names of the enzyme separated by a comma.

    Returns:
    --------
    str :
        Regex that corresponds to all possible restriction sites given a set
        of enzyme. Distinct sites are sorted and joined with ``|``.

    Examples:
    ---------
    >>> get_restriction_site('DpnII')
    'GATC'
    >>> get_restriction_site('DpnII,HinfI')
    'GA.TC|GATC'
    """
    # IUPAC nucleotide ambiguity codes and their regex equivalents.
    iupac_regex = {
        "R": "[AG]",
        "Y": "[CT]",
        "S": "[CG]",
        "W": "[AT]",
        "K": "[GT]",
        "M": "[AC]",
        "B": "[CGT]",
        "D": "[AGT]",
        "H": "[ACT]",
        "V": "[ACG]",
        "N": ".",
    }
    # Split the str on the comma and look the enzymes up in Biopython.
    rb = RestrictionBatch(enzyme.split(","))
    # A set removes enzymes sharing a site; plain bases pass through as-is.
    restriction_sites = {
        "".join(iupac_regex.get(base, base) for base in enz.site)
        for enz in rb
    }
    # Build the regex for all restriction sites.
    return "|".join(sorted(restriction_sites))
from vcfkit.utils.primer3 import primer3
from subprocess import Popen, PIPE, check_output
from .reference import resolve_reference_genome
# Print numpy arrays in full instead of abbreviating them.
np.set_printoptions(threshold=sys.maxsize)
from signal import signal, SIGPIPE, SIG_DFL
# Restore the default SIGPIPE handler.
# NOTE(review): presumably so that piping output into e.g. `head` ends
# quietly rather than with a BrokenPipeError - confirm.
signal(SIGPIPE, SIG_DFL)
from Bio.Seq import Seq
from Bio.Restriction import AllEnzymes, CommOnly, RestrictionBatch

# Global flag for header output
header_printed = False

# Fixed batch of enzymes exposed under the name `high_fidelity`.
high_fidelity = RestrictionBatch([
    "AgeI", "ApoI", "BamHI", "BbsI", "BmtI", "BsaI", "BsiWI", "BsrGI",
    "BstEII", "BstZ17I", "DraIII", "EagI", "EcoRI", "EcoRV", "HindIII",
    "KpnI", "MfeI", "MluI", "NcoI", "NheI", "NotI", "NruI", "NsiI", "PstI",
    "PvuI", "PvuII", "SacI", "SalI", "SbfI", "ScaI", "SpeI", "SphI", "SspI",
    "StyI"
])

debug = None


class cvariant:
    """
    Mutable variant object

    Copies every attribute of the wrapped variant whose name does not start
    with an underscore onto this instance.
    """

    def __init__(self, variant):
        # dir() lists all attribute names; underscore-prefixed ones are
        # skipped.
        for i in [x for x in dir(variant) if x.startswith("_") is False]:
            setattr(self, i, getattr(variant, i))
self.layer = 'Restriction Enzymes' def to_dict(self): r = super(Restriction_Site, self).to_dict() r['elucidate'] = self.enzyme.elucidate() r['cut'] = self.cut return r _MyEnzymes = [ AatII, AflII, AgeI, ApaI, ApaLI, AscI, AseI, BamHI, BclI, BglII, BstBI, ClaI, DraI, EagI, EarI, EcoRI, EcoRV, FspI, HindIII, HpaI, KpnI, MscI, NarI, NcoI, NdeI, NheI, NotI, NruI, PacI, PmlI, PstI, PvuII, SacI, SacII, SalI, SmaI, SpeI, StuI, XbaI, XhoI, XmaI ] MyEnzymes = RestrictionBatch( [x for x in _MyEnzymes if x.elucidate().find('^') >= 0]) def find_restriction_sites(sequence, circular=True): input_seq = clean_sequence(sequence) if circular is True: input2 = Seq(input_seq + input_seq) else: input2 = Seq(input_seq) r = MyEnzymes.search(input2) cutter_list = [] for enzyme in r: v = r[enzyme] for cut in v: cut_after = cut - 1 if cut_after <= 0:
def test_creating_batch(self):
    """Creating and modifying a restriction batch."""
    enzymes = RestrictionBatch()
    self.assertEqual(enzymes.suppl_codes()['N'], 'New England Biolabs')
    self.assertTrue(enzymes.is_restriction(EcoRI))

    enzymes = RestrictionBatch([EcoRI])
    enzymes.add(KpnI)
    enzymes += EcoRV
    self.assertEqual(len(enzymes), 3)
    self.assertEqual(enzymes.elements(), ['EcoRI', 'EcoRV', 'KpnI'])
    # as_string() order is not guaranteed on Python 3, so only membership
    # is checked.
    self.assertIn('EcoRI', enzymes.as_string())

    # Membership by enzyme object.
    for member in (EcoRV, EcoRI, KpnI):
        self.assertIn(member, enzymes)
    self.assertNotIn(SmaI, enzymes)

    # Membership by enzyme name.
    self.assertIn('EcoRV', enzymes)
    self.assertNotIn('SmaI', enzymes)

    # get() raises for non-members unless asked to add them.
    enzymes.get(EcoRV)
    self.assertRaises(ValueError, enzymes.get, SmaI)
    enzymes.get(SmaI, add=True)
    self.assertEqual(len(enzymes), 4)

    for unwanted in (SmaI, EcoRV):
        enzymes.remove(unwanted)
    self.assertEqual(len(enzymes), 2)

    for absent in (EcoRV, 'EcoRV'):
        self.assertNotIn(absent, enzymes)

    # A batch can be built by adding two enzymes ...
    summed = EcoRI + KpnI
    self.assertEqual(enzymes, summed)
    # ... or by adding an enzyme to a batch.
    extended = summed + EcoRV
    summed += EcoRV
    self.assertEqual(extended, summed)
    self.assertRaises(TypeError, EcoRI.__add__, 1)

    # Supplier-based batches. These checks may be 'update sensitive'
    # because company names and products change often.
    by_supplier = RestrictionBatch((), ('S'))  # Sigma
    self.assertEqual(by_supplier.current_suppliers(),
                     ['Sigma Chemical Corporation'])
    self.assertIn(EcoRI, by_supplier)
    self.assertNotIn(AanI, by_supplier)
    by_supplier.add_supplier('B')  # Life Technologies
    self.assertIn(AanI, by_supplier)
def test_dseq():
    """Exercise construction, slicing, repr, end-polishing and cutting of Dseq."""
    import textwrap
    from pydna.dseq import Dseq

    # Adding incompatible operands must raise.
    obj1 = Dseq("a", "t", circular=True)
    obj2 = Dseq("a", "t")

    with pytest.raises(TypeError):
        obj1 + obj2

    with pytest.raises(TypeError):
        obj2 + obj1

    with pytest.raises(TypeError):
        obj1 + ""

    with pytest.raises(AttributeError):
        obj2 + ""

    obj1 = Dseq("at", "t")
    obj2 = Dseq("a", "t")

    with pytest.raises(TypeError):
        obj1 + obj2

    # Slicing of circular and linear sequences.
    obj = Dseq("aaa", "ttt", circular=True)
    assert obj[1:2] == Dseq("a", "t", 0)

    assert obj[:] == Dseq("aaa", "ttt", circular=False)

    obj = Dseq("atg", "cat", 0, circular=False)

    assert obj[1:2]._data == "atg"[1:2]

    assert obj[2:1]._data == "atg"[2:1]

    assert obj.reverse_complement() == obj.rc() == Dseq("cat", "atg", 0)

    obj = Dseq("atg", "cat", circular=True)

    assert obj.looped() == obj

    assert obj[:] == Dseq("atg", "cat", 0, circular=False)

    assert obj[1:2]._data == "atg"[1:2]
    assert obj[2:1]._data == "ga"

    obj = Dseq("G", "", 0)
    assert obj.five_prime_end() == ("5'", "g")
    obj = Dseq("", "C", 0)
    assert obj.five_prime_end() == ("3'", "c")

    obj = Dseq("ccGGATCC", "aaggatcc", -2)
    assert obj._data == "ccGGATCCtt"
    assert str(obj.mung()) == "GGATCC"
    # NOTE(review): the triple-quoted repr literals in this test contain no
    # line breaks, while the other repr assertions below expect
    # "\n"-separated rows - confirm the intended whitespace.
    rpr = textwrap.dedent(
        """ Dseq(-10) ccGGATCC cctaggaa """
    ).strip()
    assert repr(obj) == rpr
    assert obj[3] == Dseq("G", "c", 0)

    assert obj.fill_in() == Dseq("ccGGATCCtt", "aaggatccgg", 0)

    assert obj + Dseq("") == obj
    assert Dseq("") + obj == obj

    # fill_in() with restricted nucleotide sets.
    obj = Dseq("gatcAAAAAA", "gatcTTTTTT")
    assert obj.fill_in("gatc") == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")
    assert obj.fill_in("atc") == obj
    assert obj.fill_in("ac") == obj
    assert obj.fill_in("at") == obj

    obj = Dseq("AAAAAAgatc", "TTTTTTgatc")
    assert obj.fill_in("gatc") == obj
    assert obj.fill_in("atc") == obj
    assert obj.fill_in("ac") == obj
    assert obj.fill_in("at") == obj

    # t4() with restricted nucleotide sets, and mung().
    obj = Dseq("gatcAAAAAA", "gatcTTTTTT")
    assert obj.t4() == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")

    assert obj.t4("at") == obj
    assert obj.t4("atg") == Dseq("gatcAAAAAAgat", "gatcTTTTTTgat")
    assert obj.t4("atgc") == Dseq("gatcAAAAAAgatc", "gatcTTTTTTgatc")
    assert obj.mung() == Dseq("AAAAAA", "TTTTTT")

    obj = Dseq("AAAAAAgatc", "TTTTTTgatc")
    assert obj.t4() == obj.t4("at") == Dseq("AAAAAA")
    assert obj.t4("atc") == obj.t4("atg") == obj.t4("atcg") == Dseq("AAAAAA")

    assert Dseq("GGATCC", "GGATCC").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("GGATCCa", "GGATCC").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("aGGATCC", "GGATCC").t4() == Dseq("aGGATCC", "GGATCCt")
    assert Dseq("aGGATCCa", "GGATCC").t4() == Dseq("aGGATCC", "GGATCCt")
    assert Dseq("GGATCC", "aGGATCC").t4() == Dseq("GGATCCt", "aGGATCC")
    assert Dseq("GGATCC", "GGATCCa").t4() == Dseq("GGATCC", "GGATCC")
    assert Dseq("GGATCC", "aGGATCCa").t4() == Dseq("GGATCCt", "aGGATCC")

    assert Dseq("GGATCC", "ATCC").t4("g") == Dseq("gg", "", ovhg=0)
    assert Dseq("GGATCC", "GGATCC").t4("gat") == Dseq("ggat", "ggat", ovhg=-2)

    # _data, mung() and repr for several overhang configurations.
    a2 = Dseq("ccGGATCCaa", "ggatcc", -2)
    assert a2._data == "ccGGATCCaa"
    assert a2._data == "ccGGATCCaa"
    assert str(a2.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-10) ccGGATCCaa cctagg """
    ).strip()
    assert repr(a2) == rpr

    a3 = Dseq("ccGGATCC", "ggatcc", -2)
    assert a3._data == "ccGGATCC"
    assert a3._data == "ccGGATCC"
    assert str(a3.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-8) ccGGATCC cctagg """
    ).strip()
    assert repr(a3) == rpr

    b = Dseq("GGATCC", "aaggatcccc", 2)
    assert b._data == "ggGGATCCtt"
    assert b._data == "ggGGATCCtt"
    assert str(b.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-10) GGATCC cccctaggaa """
    ).strip()
    assert repr(b) == rpr

    b2 = Dseq("GGATCCaa", "ggatcccc", 2)
    assert b2._data == "ggGGATCCaa"
    assert b2._data == "ggGGATCCaa"
    assert str(b2.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-10) GGATCCaa cccctagg """
    ).strip()
    assert repr(b2) == rpr

    assert b2.seguid() == "hPNrcQ0sluXyfu4XuUh1trsnygc"

    b3 = Dseq("GGATCC", "ggatcccc", 2)
    assert b3._data == "ggGGATCC"
    assert b3._data == "ggGGATCC"
    assert str(b3.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-8) GGATCC cccctagg """
    ).strip()
    assert repr(b3) == rpr

    c = Dseq("GGATCCaaa", "ggatcc", 0)
    assert c._data == "GGATCCaaa"
    assert c._data == "GGATCCaaa"
    assert str(c.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-9) GGATCCaaa cctagg """
    ).strip()
    assert repr(c) == rpr

    d = Dseq("GGATCC", "aaaggatcc", 0)
    assert d._data == "GGATCCttt"
    assert d._data == "GGATCCttt"
    assert str(d.mung()) == "GGATCC"
    rpr = textwrap.dedent(
        """ Dseq(-9) GGATCC cctaggaaa """
    ).strip()
    assert repr(d) == rpr

    # Cutting with BamHI gives two fragments that add back to the original.
    obj = Dseq("GGATCCaaa", "ggatcc", 0)
    from Bio.Restriction import BamHI

    frag1 = Dseq("G", "gatcc", 0)
    frag2 = Dseq("GATCCaaa", "g", -4)

    assert obj.cut(BamHI) == (frag1, frag2)

    assert frag1 + frag2 == obj

    # NOTE(review): this comparison has no `assert`, so its result is
    # discarded - confirm whether an assertion was intended.
    obj.seguid() == "HtK7-_BmOJw0BmtYE8f1yGdHc0c"

    assert frag1.seguid() == "yJkorWG5V2etvSLp6E6QNK-KMlQ"
    assert frag2.seguid() == "Aw3buI-N85OztBZAzeGJvXGlwO8"

    obj = Dseq("tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta")

    assert (
        repr(obj)
        == "Dseq(-30)\ntagcgtagctgtagtatgtgatctggtcta\natcgcatcgacatcatacactagaccagat"
    )

    # The three ways of building this blunt linear sequence compare equal.
    obj2 = Dseq("tagcgtagctgtagtatgtgatctggtcta")

    obj3 = obj = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta", 0
    )

    assert obj == obj2 == obj3

    assert obj.find("ggatcc") == -1

    assert obj.find("tgtagta") == 9

    # Abbreviated repr for longer sequences.
    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")

    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaa", "CCCttagaccagatcacatactacagctacgcta")

    assert repr(obj) == "Dseq(-34)\ntagc..ctaa \natcg..gattCCC"

    obj = Dseq("tagcgtagctgtagtatgtgatctggtctaaCCC", "ttagaccagatcacatactacagctacgcta")

    assert repr(obj) == "Dseq(-34)\ntagc..ctaaCCC\natcg..gatt "

    obj = Dseq("agcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")
    assert repr(obj) == "Dseq(-31)\n agcg..ctaa\natcgc..gatt"

    obj = Dseq("Atagcgtagctgtagtatgtgatctggtctaa", "ttagaccagatcacatactacagctacgcta")
    assert repr(obj) == "Dseq(-32)\nAtagc..ctaa\n atcg..gatt"

    obj = Dseq(
        "tagcgtagctgtagtatgtgatctggtctaa", "tatcgcatcgacatcatacactagaccagatt"[::-1]
    )

    assert repr(obj) == "Dseq(-32)\n tagc..ctaa\ntatcg..gatt"

    assert round(obj.mw(), 1) == 19535.6

    # The three spellings of a circular sequence compare equal.
    obj1 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta",
        "tagaccagatcacatactacagctacgcta",
        circular=True,
        linear=False,
    )
    obj2 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta",
        "tagaccagatcacatactacagctacgcta",
        circular=True,
    )
    obj3 = Dseq(
        "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta", linear=False
    )

    assert obj1 == obj2 == obj3

    assert obj1.find("ggatcc") == -1

    assert obj1.find("tgtagta") == 9

    assert (
        Dseq(
            "tagcgtagctgtagtatgtgatctggtcta", "tagaccagatcacatactacagctacgcta"
        ).looped()
        == obj1
    )

    # Cutter classification helpers.
    from Bio.Restriction import BglII, BamHI

    obj = Dseq("ggatcc")

    assert BglII in obj.no_cutters()
    assert BamHI not in obj.no_cutters()

    assert BamHI in obj.unique_cutters()

    assert BamHI in obj.once_cutters()

    assert BamHI in (obj + obj).twice_cutters()
    assert BamHI not in obj.twice_cutters()

    assert BamHI in obj.n_cutters(1)
    assert BamHI in obj.cutters()

    # Cutting with a batch compared with cutting with enzymes in either
    # order, on linear and circular sequences.
    from Bio.Restriction import RestrictionBatch

    rb = RestrictionBatch((BamHI, BglII))

    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    obj = Dseq("ggatccAGATCT")

    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    obj = Dseq("AGATCTggatcc")

    assert obj.cut(rb) == obj.cut(BamHI, BglII) == obj.cut(BglII, BamHI)

    obj = Dseq("ggatccAGATCT", circular=True)

    assert obj.cut(rb) == obj.cut(BamHI, BglII) != obj.cut(BglII, BamHI)

    obj = Dseq("AGATCTggatcc", circular=True)

    assert obj.cut(rb) == obj.cut(BglII, BamHI) != obj.cut(BamHI, BglII)
def get_restriction_table(seq, enzyme, circular=False):
    """
    Get the restriction table for a single genomic sequence.

    Parameters
    ----------
    seq : Seq object
        A biopython Seq object representing a chromosomes or contig.
    enzyme : int, str or list of str
        The name of the restriction enzyme used, or a list of restriction
        enzyme names. Can also be an integer, to digest by fixed chunk size.
        Chunk sizes below ``DEFAULT_MIN_CHUNK_SIZE`` are raised to that
        value.
    circular : bool
        Whether the genome is circular.

    Returns
    -------
    numpy.array:
        List of restriction fragment boundary positions for the input
        sequence.

    Raises
    ------
    ValueError
        If ``enzyme`` is neither a set of valid restriction enzyme names nor
        convertible to an integer chunk size.

    >>> from Bio.Seq import Seq
    >>> get_restriction_table(Seq("AAGATCGATCGG"),"DpnII")
    array([ 0,  2,  6, 12])
    >>> get_restriction_table(Seq("AA"),["DpnII", "HinfI"])
    array([0, 2])
    >>> get_restriction_table(Seq("AA"),"aeiou1")
    Traceback (most recent call last):
        ...
    ValueError: aeiou1 is not a valid restriction enzyme.
    >>> get_restriction_table("AA","DpnII")
    Traceback (most recent call last):
        ...
    TypeError: Expected Seq or MutableSeq instance, got <class 'str'> instead
    """
    chrom_len = len(seq)
    wrong_enzyme = "{} is not a valid restriction enzyme.".format(enzyme)
    # Restriction batch containing the restriction enzyme
    try:
        enz = [enzyme] if isinstance(enzyme, str) else enzyme
        cutter = RestrictionBatch(enz)
    except (TypeError, ValueError):
        # Not enzyme names: fall back to a fixed chunk size.
        try:
            cutter = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)
        except (TypeError, ValueError):
            # int() raises TypeError for a list of bad names; report it
            # like any other invalid enzyme.
            raise ValueError(wrong_enzyme) from None

    if isinstance(cutter, int):
        # Fixed-size chunks: boundaries every `cutter` bases.
        sites = list(range(0, chrom_len, cutter))
        # Close the last chunk. `not sites` covers an empty sequence, for
        # which range() yields nothing.
        if not sites or sites[-1] < chrom_len:
            sites.append(chrom_len)
    else:
        # Find sites of all restriction enzymes given
        ana = Analysis(cutter, seq, linear=not circular)
        sites = ana.full()
        # Gets all sites into a single flat list with 0-based index
        sites = [site - 1 for enz in sites.values() for site in enz]
        # Sort by position and add start and end of seq
        sites.sort()
        sites.insert(0, 0)
        sites.append(chrom_len)

    return np.array(sites)
def _catalyze(record: SeqRecord, enzymes: List[RestrictionType], linear=True) -> List[Tuple[str, SeqRecord, str]]:
    """Catalyze a SeqRecord and return all post-digest SeqRecords with overhangs.

    Each overhang is returned as the overhang sequence with a ``^`` marking
    the cut position: the marker is appended for 3' overhang enzymes and
    prepended otherwise. Every fragment is reported twice, once as found and
    once reverse-complemented.

    Args:
        record: The SeqRecord to digest with enzymes
        enzymes: List of enzymes to digest the input records with

    Keyword Args:
        linear: Whether the record to catalyze is linear or circular

    Returns:
        List of tuples with: (left overhang, cut fragment, right overhang)
    """
    record = record.upper()
    batch = RestrictionBatch(enzymes)
    batch_sites = batch.search(record.seq, linear=linear)

    # order all cuts with enzymes based on index
    # (a position cut by several enzymes is kept only for the first one seen)
    cuts_seen: Set[int] = set()
    enzyme_cuts: List[Tuple[RestrictionType, int]] = []
    for enzyme, cuts in batch_sites.items():
        for cut in cuts:
            if cut in cuts_seen:
                continue
            cuts_seen.add(cut)
            enzyme_cuts.append((enzyme, cut - 1))  # revert to 0-based
    enzyme_cuts = sorted(enzyme_cuts, key=lambda x: x[1])

    # list of left/right overhangs for each fragment
    frag_w_overhangs: List[Tuple[str, SeqRecord, str]] = []
    for i, (enzyme, cut) in enumerate(enzyme_cuts):
        # a linear record has no fragment running from the last cut back to
        # the first one
        if i == len(enzyme_cuts) - 1 and linear:
            continue

        # the modulo pairs the last cut with the first one for circular
        # records
        next_enzyme, next_cut = enzyme_cuts[(i + 1) % len(enzyme_cuts)]

        enzyme_len = len(enzyme.ovhgseq)
        next_enzyme_len = len(next_enzyme.ovhgseq)

        # shift cuts left for 3overhang enzymes
        if enzyme.is_3overhang():
            cut -= enzyme_len
        if next_enzyme.is_3overhang():
            next_cut -= next_enzyme_len

        # bounds used below to slice the reverse-complement fragment
        cut_rc = cut if enzyme.is_3overhang() else cut + enzyme_len
        next_cut_rc = (next_cut if next_enzyme.is_3overhang() else next_cut + next_enzyme_len)

        # find the cutsite sequences
        left = record[cut:cut + enzyme_len]
        right = record[next_cut:next_cut + next_enzyme_len]
        left_rc = right.reverse_complement()
        right_rc = left.reverse_complement()

        left = str(left.seq)
        right = str(right.seq)
        left_rc = str(left_rc.seq)
        right_rc = str(right_rc.seq)

        # mark the cut position: after the overhang for 3' overhang enzymes,
        # before it otherwise
        if enzyme.is_3overhang():
            left += "^"
            right_rc += "^"
        else:
            left = "^" + left
            right_rc = "^" + right_rc

        if next_enzyme.is_3overhang():
            right += "^"
            left_rc += "^"
        else:
            right = "^" + right
            left_rc = "^" + left_rc

        # shift cuts right again for 3overhang enzymes
        if enzyme.is_3overhang():
            cut += enzyme_len
        if next_enzyme.is_3overhang():
            next_cut += next_enzyme_len

        frag = record[cut:next_cut]
        frag_rc = record[cut_rc:next_cut_rc].reverse_complement()
        frag_rc.id = record.id

        if next_cut < cut:  # wraps around the zero-index
            # slice from the doubled record so the fragment can span the
            # origin
            frag = (record + record)[cut:next_cut + len(record)]
            frag.id = record.id
            frag_rc = (record + record)[cut_rc:next_cut_rc + len(record)].reverse_complement()
            frag_rc.id = record.id

        frag_w_overhangs.append((left, frag, right))
        frag_w_overhangs.append((left_rc, frag_rc, right_rc))

    return frag_w_overhangs
def find_spacers(
    itemlist: pyfaidx.Fasta,
    nuclease_info: dict,
    restriction_sites: Optional[List[str]] = None,
    chunks: int = 8,
) -> pd.DataFrame:
    """Find protospacers in a sequence for a given nuclease.

    The FASTA entries are converted to a dataframe, the spacer regex is run
    over both the sequence and its reverse complement, and the matches are
    passed in chunks to `pivot_spacers`. Duplicate spacers are collapsed to
    their first occurrence.

    Parameters
    ----------
    itemlist : :class:`~pyfaidx.Fasta`
        Parsed FASTA with sequences to examine for spacers
    nuclease_info : dict
        Information for the nuclease to use. The keys `start`, `end` and
        `spacer_regex` are read here.
    restriction_sites : `List[str]`, optional (default: `None`)
        Names of restriction endonucleases. When given, they are bundled
        into a `RestrictionBatch` and handed to `pivot_spacers`.
    chunks : `int`, optional (default: 8)
        Number of pieces to divide the spacer dataframe into before
        pivoting.

    Return
    ------
    :class:`~pandas.DataFrame`
    """
    pattern = regex.compile(nuclease_info["spacer_regex"])
    start_offset: int = nuclease_info["start"]
    end_offset: int = nuclease_info["end"]

    # Enzymes whose sites the spacers are checked against, if any.
    batch = RestrictionBatch(restriction_sites) if restriction_sites else None

    # One row per FASTA entry; collect the regex matches on both strands.
    frame = fasta_to_df(itemlist)
    strand_passes = (
        ("finding forward spacers", "sequence", "forward_spacers"),
        ("finding reverse spacers", "reverse_complement", "reverse_spacers"),
    )
    for description, source_column, target_column in strand_passes:
        tqdm.pandas(desc=description, unit="sequences")
        frame[target_column] = frame[source_column].progress_apply(
            pattern.findall
        )
    frame = frame.drop(columns=["sequence", "reverse_complement"])

    pieces = np.array_split(frame, chunks)
    pivot = partial(
        pivot_spacers,
        spacer_start=start_offset,
        spacer_end=end_offset,
        restriction_sites=batch,
    )
    pivoted = pd.concat(map(pivot, pieces))

    # The same spacer can turn up more than once; keep the first.
    return pivoted.groupby("spacer").first().reset_index()
def restriction_supplier(seq, max_band=23130, min_band=2000,
                         suppliers='ACEGFIHKJMONQPSRUVX', linear=False,
                         p='yes'):
    """
    Performs restriction endonuclease analysis by batch, based on supplier.

    Parameters:
        seq = DNA sequence for restriction endonuclease digestion
        max_band = size of maximum band in basepairs. Default = 23130
        min_band = size of minimum band in basepairs. Default = 2000
        suppliers = restriction enzyme supplier.
            Default = ACEGFIHKJMONQPSRUVX where
            A = Amersham Pharmacia Biotech
            C = Minotech Biotechnology
            E = Stratagene
            G = Qbiogene
            F = Fermentas AB
            I = SibEnzyme Ltd.
            H = American Allied Biochemical, Inc.
            K = Takara Shuzo Co. Ltd.
            J = Nippon Gene Co., Ltd.
            M = Roche Applied Science
            O = Toyobo Biochemicals
            N = New England Biolabs
            Q = CHIMERx
            P = Megabase Research Products
            S = Sigma Chemical Corporation
            R = Promega Corporation
            U = Bangalore Genei
            V = MRC-Holland
            X = EURx Ltd.
        linear = flag to define if DNA sequence is linear.
            Default = False (DNA is circular)
        p = flag passed on to restriction_digest to determine if the data
            is to be printed. Default = yes. When it is anything other than
            'yes', a progress line is printed after every 10 enzymes.

    Returns:
        {Restriction endonuclease :
            (Total number of fragments after digestion,
            Number of fragments with molecular size above max_band,
            Number of fragments with molecular size between max_band and
            min_band,
            Number of fragments with molecular size below min_band)}
        Enzymes whose digestion ran out of memory are reported on stdout
        and left out of the result.
    """
    from Bio.Restriction import RestrictionBatch

    result = {}
    batch = RestrictionBatch(first=[],
                             suppliers=[x.upper() for x in suppliers])
    for count, enzyme in enumerate(batch, 1):
        try:
            digest = restriction_digest(seq, enzyme, max_band, min_band,
                                        linear, p)
        except MemoryError:
            # There is no digest for this enzyme; skip recording rather
            # than reuse a stale or unbound `digest`.
            print('Memory Error during ' + str(enzyme) + ' digestion')
        else:
            result[str(enzyme)] = (digest[0], len(digest[1]),
                                   len(digest[2]), len(digest[3]))
        if p != 'yes' and count % 10 == 0:
            print(str(count) + ' restriction endonuclease processed')
    return result
def findRestrictionSites(sequence, restr_batch):
    """Run a full restriction analysis of `sequence` against `restr_batch`.

    `sequence` is wrapped in a Seq with the ambiguous-DNA alphabet and
    `restr_batch` is handed to RestrictionBatch; the value returned is
    whatever Analysis(...).full() produces for that pair.
    """
    dna = Seq(sequence, IUPACAmbiguousDNA())
    return Analysis(RestrictionBatch(restr_batch), dna).full()
def analyze(name, name_bank, enzyme, genome_index, genome_fasta, genome_fastq,
            tag_length=6, looping=False, speed_looping=4, quality_min=30,
            tot_len_read=700, len_paired_wise_fastq=3, paired_wise_fastq=True,
            bowtie2=os.path.join(working_directory, "bowtie2"),
            bank_folder=os.path.join(toolbox_directory, "results"), ncpu=4):
    """Build a hic_exp bank from paired fastq reads and run its pipeline.

    The function resolves the restriction site, works out the two fastq
    inputs, constructs ``hic_exp.hic_exp`` and then calls, in order,
    ``align``, ``pcr_free``, ``paired_reads_2_fragments``, ``gc_size_bias``
    and ``fragments_contacts_2_weighted_contacts`` on it, printing timings
    along the way.

    Parameters
    ----------
    name : unused by the live code (only referenced in commented-out lines).
    name_bank : printed and passed to ``hic_exp.hic_exp`` as first argument.
    enzyme : enzyme name looked up in ``AllEnzymes``; when it is not found
        there, the value itself is used as the restriction site.
    genome_index, genome_fasta : forwarded unchanged to ``hic_exp.hic_exp``.
    genome_fastq : either ``"read1,read2"`` or a single string; for a single
        string ending in ``'1'`` or ``'2'`` the mate is derived by swapping
        that last character, otherwise the second read is the empty string.
    tag_length, looping, speed_looping, quality_min, tot_len_read,
    len_paired_wise_fastq, paired_wise_fastq, bowtie2, ncpu :
        forwarded unchanged to ``hic_exp.hic_exp``.
    bank_folder : directory created if missing (single level, ``os.mkdir``)
        and used to build the output folder.

    Returns
    -------
    True, once every pipeline step has returned.

    Notes
    -----
    The default values of ``bowtie2`` and ``bank_folder`` are evaluated once,
    when the function is defined, from the module-level ``working_directory``
    and ``toolbox_directory``.
    """
    # NOTE(review): time.clock() was removed in Python 3.8; this function is
    # Python 2 code (print statements), so it is left as is.
    start_total = time.clock()
    hostname = socket.gethostname()
    print "Host name:", hostname
    # Short host name; only the commented-out per-host configuration below
    # ever consulted it.
    ordi = hostname.split('.')[0]
    # if ordi == 'renoir':
    #     folder_alignment_toolbox = '/Volumes/Data/HiC_project/alignment_toolbox'
    #     bowtie2 = '/Volumes/Data/HiC_project/alignment_toolbox/bowtie2-2.0.0-beta5/'
    #     out_foldR = '/Volumes/Data/hic_data/27_08_2013/results'
    #     base_folder= '/Volumes/Data/hic_data/27_08_2013'
    #     ncpu = 24
    # else:
    folder_alignment_toolbox = toolbox_directory  # assigned but not read below
    # bowtie2 = os.path.join(working_directory, "bowtie2")
    # out_foldR = os.path.join(toolbox_directory, "results")
    base_folder = working_directory
    # ncpu = 4
    out_foldR = bank_folder
    if not (os.path.exists(out_foldR)):
        os.mkdir(out_foldR)
    ########################## PARAMETERS #################################################################################
    # name = 'Vibrio_WT'
    # name_bank = 'Vibrio_209'
    # Both read folders are the working directory with a trailing separator.
    folder_a = os.path.join(base_folder, '')
    folder_b = os.path.join(base_folder, '')
    if enzyme in AllEnzymes:
        restriction_site = RestrictionBatch([enzyme]).get(enzyme).site  # HpaII
    else:
        # Not a known enzyme name: the argument is used as the site itself.
        restriction_site = enzyme
    #folder_alignment_toolbox +
    # genome_index = os.path.join(working_directory, "index/"+name)
    # genome_fasta = os.path.join(working_directory, "fasta/"+name+".fa")
    # tag_length = 6
    # looping = False
    # speed_looping = 4
    # quality_min = 30
    print name_bank
    print out_foldR
    # NOTE(review): out_foldR is bank_folder, so this joins bank_folder with
    # itself (for an absolute bank_folder the result is bank_folder again; for
    # a relative one it is nested) - confirm this is intended.
    output_folder = os.path.join(out_foldR, bank_folder)
    # paired_wise_fastq = True
    # len_paired_wise_fastq = 3
    # tot_len_read = 700
    print genome_fastq
    print genome_fastq.split(',')
    if len(genome_fastq.split(',')) <= 1:
        # Single value: derive the mate by swapping the trailing 1/2.
        motif_read_1 = genome_fastq
        print "Reading " + motif_read_1
        if motif_read_1[-1] == '2':
            motif_read_2 = motif_read_1[:-1] + '1'
            print "Reading " + motif_read_2
        elif motif_read_1[-1] == '1':
            motif_read_2 = motif_read_1[:-1] + '2'
            print "Reading " + motif_read_2
        else:
            motif_read_2 = ""
            print "Warning: no second fastq file found"
    else:
        # Comma-separated value: the first two items are the two reads.
        motif_read_1 = genome_fastq.split(',')[0]
        motif_read_2 = genome_fastq.split(',')[1]
        print motif_read_1
        print motif_read_2
    #######################################################################################################################
    hic_bank = hic_exp.hic_exp(name_bank, tot_len_read, folder_a, folder_b,
                               motif_read_1, motif_read_2, paired_wise_fastq,
                               restriction_site, ncpu, tag_length,
                               genome_index, genome_fasta, bowtie2, looping,
                               quality_min, output_folder, speed_looping,
                               len_paired_wise_fastq)
    # `start` times the alignment stage only; `start_total` the whole call.
    start = time.clock()
    hic_bank.align()
    hic_bank.pcr_free()
    hic_bank.paired_reads_2_fragments()
    elapsed = (time.clock() - start)
    print 'Paired reads aligned in ' + str(elapsed) + ' s'
    print " start computing biases..."
    hic_bank.gc_size_bias()
    print " done."
    print "writing abs weighted contacts"
    hic_bank.fragments_contacts_2_weighted_contacts()
    elapsed = (time.clock() - start_total)
    print "all done in " + str(elapsed) + " s"
    print "ready for computation"
    return True
def __init__(self, enzymes, sequence, is_linear=True):
    """Search `sequence` for the sites of `enzymes` and keep the result.

    :param enzymes: enzymes handed to RestrictionBatch
    :param sequence: object exposing the searched sequence as ``.seq``
    :param is_linear: second argument of the batch search (default True)
    """
    batch = RestrictionBatch(enzymes)
    self.enzymes = enzymes
    self.sequence = sequence
    self.is_linear = is_linear
    self.res_batch = batch
    # Result of the batch search, stored as returned.
    self.site_dict = batch.search(sequence.seq, is_linear)
class DigestedSequence:
    """A sequence record together with the cut sites of a set of enzymes.

    The restriction search runs once, in the constructor; the accessors
    below only reshape its result.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        Search the sequence for restriction sites.

        :param enzymes: enzymes handed to RestrictionBatch
        :param sequence: record exposing the searched sequence as ``.seq``
        :param is_linear: passed to the batch search as its second argument
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing
        position (the ordering is the one defined by CutSite).

        :return: list of CutSites
        """
        # .items() (not .iteritems()) so the method runs on Python 2 and 3.
        return sorted(
            CutSite(e_name, loc)
            for e_name, ctg_locs in self.site_dict.items()
            for loc in ctg_locs
        )

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Only the fragments lying between two consecutive sites are produced.

        :return: list of SeqRecords.
        """
        sites = self.get_sites()
        seq = self.sequence
        if len(sites) == 0:
            # NOTE(review): this returns the record itself rather than a
            # one-element list; kept because callers may depend on it.
            return seq
        frags = []
        for a, b in zip(sites, sites[1:]):
            frg = seq[a:b]
            frg.id = "{0}:{1}:{2}".format(frg.id, a, b)
            frg.name = frg.id
            frg.description = "restriction digest fragment from {0} to {1}".format(a, b)
            frags.append(frg)
        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.

        :param seq_list: list of dicts with the keys "excluded" and "record"
        :param enzyme_names: enzymes used in digestion (default: none)
        :param min_sites: sequences with fewer sites are reported on stdout;
            they are still part of the returned list.
        :return: list of {"name": record id, "pos": sites} dicts
        """
        if enzyme_names is None:
            # Fresh list per call instead of a shared mutable default.
            enzyme_names = []
        sites = []
        for seq in seq_list:
            if seq["excluded"]:
                continue
            ds = DigestedSequence(enzyme_names, seq["record"])
            seq_sites = ds.get_sites()
            if len(seq_sites) < min_sites:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("\tExcluded {0} (length {1}) with only {2} sites".format(
                    seq["record"].id, len(seq["record"]), len(seq_sites)
                ))
                # continue
            sites.append({"name": seq["record"].id, "pos": seq_sites})
        return sites
def _summarize_offtargets(offtargets, mmpos_re):
    """Return (mismatch-position lists, perfect-match count) for one spacer's alignments."""
    mmlist = []
    badcount = 0
    for entry in offtargets:
        pos = mmpos_re.findall(entry.get('mmpositions', ''))
        if len(pos) == 0:
            # Bowtie returns a match for the spacer itself in the genome;
            # alignments without mismatch positions are counted separately.
            badcount += 1
        elif entry['strand'] == '+':
            mmlist.append([int(w) for w in pos])
        elif entry['strand'] == '-':
            mmlist.append([19 - int(w) for w in pos])
    return mmlist, badcount


def _write_spacer_table(outfile, headerlist, rows):
    """Write the header and the rows to `outfile` as an excel-dialect CSV."""
    with open(outfile, 'w') as ofile:
        output = csv.writer(ofile, dialect='excel')
        output.writerows([headerlist])
        output.writerows(rows)


def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=None, largeIndex=False,
                 cutoff=0, offtargetcutoff=0, trim=False, logging=False, nuclease='Cas9',
                 return_limit=9, reject=False):
    """Find, filter and score protospacers in a FASTA file and write them to a CSV.

    Steps: read the records of `target`; collect every regex match of the
    nuclease pattern on both strands; discard candidates that contain
    "TTTT", a site of `restriction_sites`, a GC content <= 20 or >= 80, or
    an on-target score below `cutoff`; align the survivors with bowtie
    against `refgenome`; score the reported alignments per spacer; write the
    result to `outfile`.

    Side effects: creates 'temp.fa' and 'offtargets.fa' in the current
    directory, runs the external program 'bowtie', prints progress, and may
    prompt on stdin when `outfile` cannot be written.

    :param target: path of the FASTA file to search
    :param outfile: path of the CSV file to write
    :param refgenome: bowtie index passed on the command line
    :param restriction_sites: enzymes whose sites must not occur in a spacer
    :param largeIndex: add '--large-index' to the bowtie command
    :param cutoff: minimum on-target score (converted with float())
    :param offtargetcutoff: minimum off-target score (converted with float())
    :param trim: read the file with m_FastaIterator instead of SeqIO.parse
    :param logging: unused
    :param nuclease: 'Cas9' or 'Cpf1'
    :param return_limit: maximum number of spacers kept per gene, or 'all'
    :param reject: when truthy, passed to bowtie as '-k' and used as the
        maximum number of mismatched alignments tolerated for a spacer
    :return: 0 when nothing was written (no spacer found, none left after
        off-target scoring, or the user declined the retry); None otherwise
    :raises ValueError: for an unsupported `nuclease`
    """
    if restriction_sites is None:
        restriction_sites = []

    # Universal newlines are the default in text mode; the old 'rU' flag is
    # rejected by recent Python versions.
    with open(target, 'r') as infile:
        # Use our modified FastaIterator instead of, perhaps the more proper, SeqIO.parse() method
        # allows us to trim the UTR sequences off
        if trim:
            # If the 'trim' option is enabled, the header for each entry must be
            # GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON | 5' UTR END | 3' UTR STOP | EXON START | EXON END
            itemiter = m_FastaIterator(infile)
        else:
            itemiter = SeqIO.parse(infile, 'fasta')
        # The iterator is consumed while the file is still open.
        itemlist = list(itemiter)

    spacerlist = []

    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3 nucleotides for scoring
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'
    else:
        raise ValueError("Unsupported nuclease: {}".format(nuclease))

    rsb = RestrictionBatch(restriction_sites)
    seen = set()  # spacers already collected; a set keeps the lookup O(1)
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
               progressbar.Bar(), progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()
    spacer_re = regex.compile(PAM)
    cutoff_value = float(cutoff)  # may arrive as a string from the command line

    for item_number, item in enumerate(itemlist, 1):
        # Find all of the potential protospacer sequences on both strands
        progress.update(item_number)
        forward = str(item.seq)
        spacerMatch = (spacer_re.findall(forward, overlapped=True) +
                       spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))
        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer. I need the rest of the sequence for scoring
            spacer = ps[4:24]
            ps_seq = Seq(spacer, IUPAC.unambiguous_dna)
            rs = rsb.search(ps_seq)
            # on_target_score_calculator only works if the sequence is in uppercase
            score = calc_score(ps.upper())
            # Get rid of anything with T(4+) as those act as RNAPIII terminators
            if "TTTT" in spacer:
                continue
            # Get rid of anything that has the verboten restriction sites
            if any(y != [] for y in rs.values()):
                continue
            # Eliminate potentials with a GC content <20 or >80%
            if GC(ps_seq) <= 20 or GC(ps_seq) >= 80:
                continue
            if float(score) < cutoff_value:
                continue
            # because of duplicated sequences, keep each spacer only once
            if spacer in seen:
                continue
            # NOTE(review): find() returns -1 for matches that come from the
            # reverse complement, and the header needs at least 8 '|' fields.
            position = int(forward.find(ps)) + int(item.description.split("|")[7])
            spacerlist.append({
                'description': item.description,
                'position': int(position),
                'score': score,
                'spacer': spacer,
                'offtargetscore': 100,
            })
            seen.add(spacer)

    progress.finish()

    if len(spacerlist) == 0:
        print("Sorry, no spacers matching that criteria were found")
        return 0
    print("Finished finding spacers. {} spacers found. Begining Bowtie alignment...".format(len(spacerlist)))

    # write out a file to pass off to Bowtie to use for off-target analysis
    with open('temp.fa', 'w') as fasta_out:
        for entry in spacerlist:
            fasta_out.write(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                 entry['score'], entry['spacer']))

    # delete lists we are not going to use in the future to save on some memory
    del itemlist
    del seen

    # Use Bowtie to find if this particular sequence has any potential off targets
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs or do we even need to?
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())

    command = [program, '-a', "--suppress", "3,5,6,7", cpus]
    if reject:
        # maybe tell bowtie to stop looking after a set number of matches;
        # subprocess only accepts strings on the command line
        command += ['-k', str(reject)]
    if largeIndex:
        command.append('--large-index')
    command += [refgenome, '-f', 'temp.fa', 'offtargets.fa']
    subprocess.check_output(command)

    print("Bowtie finished. Begining offtarget analysis...")
    bowtie_results_file = 'offtargets.fa'

    # This count is slow and probably unnecessary
    with open(bowtie_results_file) as counted_file:
        total_lines = sum(1 for _ in counted_file)
    print("Total alignments from Bowtie: {}".format(total_lines))

    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ',
                   progressbar.Percentage(), ' ', progressbar.Bar(), progressbar.Timer(),
                   progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()

    prunedlist = []  # List to hold spacers whose off-target score is above the minimum cutoff
    mmpos_re = regex.compile('[0-9]{1,}')

    # Read names as written to temp.fa -> first spacer carrying that name.
    spacers_by_readname = {}
    for spacer in spacerlist:
        readname = '{} {} {}'.format(spacer['description'], spacer['position'], str(spacer['score']))
        spacers_by_readname.setdefault(readname, spacer)

    def score_group(readname, offtargets):
        # Set the off-target score of the spacer the alignments belong to.
        matching_spacer = spacers_by_readname.get(readname)
        if matching_spacer is None:
            # Alignment that cannot be traced back to a spacer: nothing to score.
            return
        mmlist, badcount = _summarize_offtargets(offtargets, mmpos_re)
        if badcount > 2:
            matching_spacer['offtargetscore'] = 0
        # Speed this up by rejecting things with over a certain set of matching off-targets
        elif reject and len(mmlist) > int(reject):
            matching_spacer['offtargetscore'] = 0
        else:
            matching_spacer['offtargetscore'] = sumofftargets(mmlist)
        if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
            prunedlist.append(matching_spacer)

    keys = ['readname', 'strand', 'position', 'mmpositions']
    with open(bowtie_results_file) as offtargetsfile:
        current_readname = None
        current_set_of_offtargets = []
        for oftcount, line in enumerate(offtargetsfile, 1):
            new_progress.update(oftcount)
            if not line.strip():
                continue
            entry = dict(zip(keys, line.strip('\n').split('\t')))
            if current_set_of_offtargets and entry['readname'] != current_readname:
                # A new spacer starts: score the finished set first.
                score_group(current_readname, current_set_of_offtargets)
                current_set_of_offtargets = []
            current_readname = entry['readname']
            current_set_of_offtargets.append(entry)
        # The last set has no following line to trigger its scoring.
        if current_set_of_offtargets:
            score_group(current_readname, current_set_of_offtargets)

    new_progress.finish()
    print("\nFinished scoring off-targets")

    if len(prunedlist) == 0:
        print("No spacers were found that correspond to the parameters you set.")
        return 0

    if len(prunedlist[0]['description'].split('|')) == 9:  # need to make sure the header format is correct
        finallist = [FormattedResult(entry) for entry in prunedlist]
        # Group the results per GeneID
        spacers_by_gene = {}
        for result in finallist:
            spacers_by_gene.setdefault(result.GeneID, []).append(result)
        toplist = []
        for gene_spacers in spacers_by_gene.values():
            ranked_spacers = sorted(gene_spacers, key=attrgetter('score', 'offtargetscore'), reverse=True)
            if return_limit == 'all':
                toplist.extend(ranked_spacers)
            else:
                # The slice also covers genes with fewer spacers than the limit.
                toplist.extend(ranked_spacers[:int(return_limit)])
        olist = [[entry.GeneID, entry.GeneName, entry.seq, entry.GC, entry.position, entry.score,
                  entry.offtargetscore] for entry in toplist]
        headerlist = ['GeneID', 'GeneName', 'seq', '%GC', 'position', 'score', 'off-target score']
    else:
        # if it isn't, just dump all the spacers we found into a file
        olist = [[entry['description'].split(' ')[0], entry['spacer'], GC(entry['spacer']),
                  entry['position'], entry['score']] for entry in spacerlist]
        headerlist = ['ID', 'seq', '%GC', 'position', 'score']

    print("Writing file.")
    try:
        _write_spacer_table(outfile, headerlist, olist)
    except IOError:
        print("There is trouble writing to the file.  Perhaps it is open in another application?")
        choice = input("Would you like to try again? [y/n]")
        if choice == 'y' or choice == 'Y':
            try:
                _write_spacer_table(outfile, headerlist, olist)
            except IOError:
                print("Sorry, still was unable to write to the file")
        else:
            return 0
    print("Finished.")
def gen_enzyme_religation_regex(enzyme):
    """Build the regex matching every religation site of a set of enzymes.

    Each enzyme contributes a "give" part (the site up to the reverse cut
    mark, leading N removed) and an "accept" part (the site after the
    forward cut mark, trailing N removed). Every give/accept pair is joined,
    in both orientations, to form one alternative of the pattern.

    Parameters:
    -----------
    enzyme : str
        Names of the enzymes, separated by commas.

    Returns:
    --------
    re.Pattern :
        Alternation of the distinct ligation sites, sorted, with "N"
        written as ".".

    Examples:
    ---------
    >>> gen_enzyme_religation_regex('HpaII')
    re.compile('CCGCGG')
    >>> gen_enzyme_religation_regex('HpaII,MluCI')
    re.compile('AATTAATT|AATTCGG|CCGAATT|CCGCGG')
    """
    batch = RestrictionBatch(enzyme.split(","))

    donors = []
    acceptors = []
    for enz in batch:
        marked_site = enz.elucidate()
        top_cut = marked_site.find("^")
        bottom_cut = marked_site.find("_")

        # Left part of the junction: everything before the reverse cut.
        donor = marked_site[:bottom_cut].replace("^", "")
        while donor[0] == "N":
            donor = donor[1:]
        donors.append(donor)

        # Right part of the junction: everything after the forward cut.
        acceptor = marked_site[top_cut + 1:].replace("_", "")
        while acceptor[-1] == "N":
            acceptor = acceptor[:-1]
        acceptors.append(acceptor)

    junctions = set()
    for donor in donors:
        for acceptor in acceptors:
            joined = donor + acceptor
            junctions.add(joined.replace("N", "."))
            junctions.add(str(Seq(joined).reverse_complement()).replace("N", "."))

    return re.compile("|".join(sorted(junctions)))
class DigestedSequence:
    """A sequence record together with the cut sites of a set of enzymes.

    The restriction search runs once, in the constructor; the accessors
    below only reshape its result.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        Search the sequence for restriction sites.

        :param enzymes: enzymes handed to RestrictionBatch
        :param sequence: record exposing the searched sequence as ``.seq``
        :param is_linear: passed to the batch search as its second argument
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing
        position (the ordering is the one defined by CutSite).

        :return: list of CutSites
        """
        # .items() (not .iteritems()) so the method runs on Python 2 and 3.
        return sorted(
            CutSite(e_name, loc)
            for e_name, ctg_locs in self.site_dict.items()
            for loc in ctg_locs)

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Only the fragments lying between two consecutive sites are produced.

        :return: list of SeqRecords.
        """
        sites = self.get_sites()
        seq = self.sequence
        if len(sites) == 0:
            # NOTE(review): this returns the record itself rather than a
            # one-element list; kept because callers may depend on it.
            return seq
        frags = []
        for a, b in zip(sites, sites[1:]):
            frg = seq[a:b]
            frg.id = '{0}:{1}:{2}'.format(frg.id, a, b)
            frg.name = frg.id
            frg.description = 'restriction digest fragment from {0} to {1}'.format(
                a, b)
            frags.append(frg)
        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.

        :param seq_list: list of dicts with the keys 'excluded' and 'record'
        :param enzyme_names: enzymes used in digestion (default: none)
        :param min_sites: sequences with fewer sites are reported on stdout;
            they are still part of the returned list.
        :return: list of {'name': record id, 'pos': sites} dicts
        """
        if enzyme_names is None:
            # Fresh list per call instead of a shared mutable default.
            enzyme_names = []
        sites = []
        for seq in seq_list:
            if seq['excluded']:
                continue
            ds = DigestedSequence(enzyme_names, seq['record'])
            seq_sites = ds.get_sites()
            if len(seq_sites) < min_sites:
                # Single-argument print() behaves the same on Python 2 and 3.
                print('\tExcluded {0} (length {1}) with only {2} sites'.format(
                    seq['record'].id, len(seq['record']), len(seq_sites)))
                #continue
            sites.append({'name': seq['record'].id, 'pos': seq_sites})
        return sites
def write_frag_info(
    fasta,
    enzyme,
    size=DEFAULT_THRESHOLD_SIZE,
    circular=False,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME,
    output_dir=None,
):
    """Write the fragments_list.txt and info_contigs.txt that are necessary
    for GRAAL to run.

    Parameters
    ----------
    fasta : FASTA input handed to SeqIO.parse.
    enzyme : enzyme name; when RestrictionBatch does not know it
        (ValueError), it is read as an integer chunk size, raised to at
        least DEFAULT_MIN_CHUNK_SIZE, and contigs are cut into fixed-size
        chunks instead.
    size : contigs shorter than int(size) are skipped.
    circular : the enzyme digestion is run with ``linear=not circular``.
    output_contigs, output_frags : names of the two output files.
    output_dir : directory the two names are joined to; None writes them
        as given.

    The contig table gets one row per kept contig (name, length, number of
    fragments, number of fragments written for the previous contigs); the
    fragment table gets one row per non-empty fragment (id within the
    contig, contig name, start, end, size, GC fraction).
    """

    try:
        my_enzyme = RestrictionBatch([enzyme]).get(enzyme)
    except ValueError:
        my_enzyme = max(int(enzyme), DEFAULT_MIN_CHUNK_SIZE)

    records = SeqIO.parse(fasta, "fasta")

    # Explicit test: os.path.join(None, ...) raises TypeError on Python 3,
    # which the former "except AttributeError" fallback never caught.
    if output_dir is None:
        info_contigs_path = output_contigs
        frag_list_path = output_frags
    else:
        info_contigs_path = os.path.join(output_dir, output_contigs)
        frag_list_path = os.path.join(output_dir, output_frags)

    # Resolved once: the threshold must not change while contigs are read.
    min_contig_length = int(size)

    with open(info_contigs_path, "w") as info_contigs:
        info_contigs.write("contig\tlength_kb\tn_frags\tcumul_length\n")
        with open(frag_list_path, "w") as fragments_list:
            fragments_list.write("id\tchrom\tstart_pos"
                                 "\tend_pos\tsize\tgc_content\n")

            total_frags = 0

            for record in records:
                my_seq = record.seq
                contig_name = record.id
                contig_length = len(my_seq)
                if contig_length < min_contig_length:
                    continue

                try:
                    my_frags = my_enzyme.catalyze(my_seq,
                                                  linear=not circular)
                except AttributeError:
                    # my_enzyme is a chunk size: cut into fixed-size pieces.
                    n = len(my_seq)
                    my_frags = (my_seq[i:min(i + my_enzyme, n)]
                                for i in range(0, len(my_seq), my_enzyme))

                n_frags = 0
                current_id = 1
                start_pos = 0
                for frag in my_frags:
                    # Own name on purpose: reusing `size` here replaced the
                    # contig threshold with the last fragment length.
                    frag_size = len(frag)
                    if frag_size > 0:
                        end_pos = start_pos + frag_size
                        gc_content = SeqUtils.GC(frag) / 100.0

                        current_fragment_line = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                            current_id,
                            contig_name,
                            start_pos,
                            end_pos,
                            frag_size,
                            gc_content,
                        )

                        fragments_list.write(current_fragment_line)

                        # Invariant: only the first fragment starts at 0.
                        try:
                            assert (current_id == 1 and start_pos == 0) or (
                                current_id > 1 and start_pos > 0)
                        except AssertionError:
                            print((current_id, start_pos))
                            raise
                        start_pos = end_pos
                        current_id += 1
                        n_frags += 1

                # The last column is written before this contig is added.
                current_contig_line = "%s\t%s\t%s\t%s\n" % (
                    contig_name,
                    contig_length,
                    n_frags,
                    total_frags,
                )
                total_frags += n_frags
                info_contigs.write(current_contig_line)
#
# Created:     11/04/2013
#-------------------------------------------------------------------------------
# Fetch one nucleotide record from Entrez, digest it with Sau3AI, then list
# every New England Biolabs enzyme that has at least one site in it.

from Bio import Entrez
from Bio import SeqIO
from Bio.Restriction import Restriction
from Bio.Restriction import RestrictionBatch

email="*****@*****.**"
Entrez.email = email

fetch_seq = Entrez.efetch(db="nucleotide", rettype="fasta",retmode="text", id="294489415")
try:
    seq_record = SeqIO.read(fetch_seq, "fasta")
finally:
    # Close the handle even when the record cannot be parsed.
    fetch_seq.close()

# To see how a specific restriction enzyme (Sau3AI) would digest your sequence.
# Every message is a single formatted string so that print() gives the same
# output on Python 2 and Python 3.
print("Restriction site is {}".format(Restriction.Sau3AI.site))
digest = Restriction.Sau3AI.catalyse(seq_record.seq)
print("Number of fragments is {}".format(len(digest)))
print("------\nLengths of each fragment\n------")
for fragment in digest:
    print(len(fragment))

# Run every restriction enzyme in the New England Biolabs database against the sequence
rb_supp = RestrictionBatch(first=[], suppliers=['N'])
# One search for the whole batch instead of several per enzyme.
search_results = rb_supp.search(seq_record.seq)
for rest, positions in search_results.items():
    # The code commented out below will only show restriction enzymes that created a number of fragments between 10 and 40
    #if len(positions) > 10 and len(positions) < 40:
    # This will show every restriction enzyme that was able to digest the sequence in some way.
    if len(positions) > 0:
        print("{} : {}".format(rest, positions))
def _build_codon_sequence(amino_acids, sequence, aa_count_dict):
    """Return one candidate DNA string for `sequence`, checked for length."""
    codon_list = Calculator.compute_and_Switch(amino_acids, sequence, aa_count_dict)
    dna = "".join(codon_list).replace("U", "T")
    if len(dna) != len(sequence) * 3:
        raise Exception("final sequance length does not match input sequence length")
    return dna


def main(protein_fasta_open_file, list_codon_usage_open_files, output_destination, restriction_enzymes=""):
    """Back-translate a protein into DNA and write it as FASTA.

    The protein is read from `protein_fasta_open_file`; the codon usage
    tables of every organism in `list_codon_usage_open_files` are added to
    the amino-acid objects, and Calculator.compute_and_Switch picks the
    codons. When `restriction_enzymes` is given, up to 100 further
    candidates are generated until one is not cut by any of them; if none
    is found, the candidate cut by the fewest enzymes is written.

    :param protein_fasta_open_file: open FASTA file holding the protein
    :param list_codon_usage_open_files: iterable of (file name, open file)
        pairs, one per organism; the organism name is the file name up to
        its first '.'
    :param output_destination: target handed to SeqIO.write
    :param restriction_enzymes: enzyme names separated by commas and/or
        whitespace; "" disables the check
    :return: a status message (str)
    :raises Exception: when no codon table is given, when the DNA length is
        not three times the protein length, or when the DNA does not
        translate back to the protein
    """
    # parse protein
    record = Parser.parse_fasta_file(protein_fasta_open_file)
    name, sequence = record.name, record.seq

    # parse table
    if len(list_codon_usage_open_files) == 0:
        raise Exception("Error: Empty codon table filnames")

    # parses organism files , assuming they are already open
    creatures = {}
    for fname, open_file in list_codon_usage_open_files:
        creature_name = fname.split('.')[0]
        codon_usage_dict, codon_to_protein_dict, AA_list = Parser.parse_kazusa_codon_usage_table(open_file)
        creatures[creature_name] = codon_usage_dict, codon_to_protein_dict, AA_list

    # creates AA, using the tables of the last organism parsed
    _, codon_to_protein_dict, AA_LIST = creatures[creature_name]
    Amino_Acids_obj_list = [AminoAcid.AminoAcid(aa, codon_to_protein_dict) for aa in AA_LIST]

    for creature_name, creature_tuple in creatures.items():
        codon_usage_dict = creature_tuple[0]
        for AA in Amino_Acids_obj_list:
            AA.add_organism_codons(codon_usage_dict, creature_name)

    prot_analisys = ProtParam.ProteinAnalysis(sequence._data)
    aa_count_dict = prot_analisys.count_amino_acids()

    # replaces aa with codons from codon pool
    final_sequence = _build_codon_sequence(Amino_Acids_obj_list, sequence, aa_count_dict)
    record = SeqRecord.SeqRecord(Seq(final_sequence, ), name=name)
    if record.translate().seq != sequence:
        raise Exception("error- resulting DNA does not translate back to protein")

    # restriction enzymes- verifies they do not cut the sequence. if they do, pick the least cut sequence
    if restriction_enzymes != "":
        # split() already separates on any whitespace (spaces, tabs, newlines)
        restriction_enzymes_list = restriction_enzymes.replace(",", " ").split()
        batch = RestrictionBatch(restriction_enzymes_list)
        num_cutting = len(check_restriction(Seq(final_sequence, generic_dna), batch))

        # The first candidate is the best one seen so far.
        best_num_cutting = num_cutting
        best_sequ = final_sequence
        iterations = 100

        # if the original sequence had a restriction site, repeat the sequence building 100 times , or until
        # a non- cut sequence is found
        while iterations > 0 and num_cutting > 0:
            final_sequence = _build_codon_sequence(Amino_Acids_obj_list, sequence, aa_count_dict)
            record = SeqRecord.SeqRecord(Seq(final_sequence, generic_dna), name=name)
            if record.translate().seq != sequence:
                # Raised like the same check above instead of ending the
                # whole interpreter with exit(1).
                raise Exception("error- resulting DNA does not translate back to protein")

            # if achieved non cutting sequence, save and return
            num_cutting = len(check_restriction(Seq(final_sequence, generic_dna), batch))
            if num_cutting == 0:
                check_restriction(Seq(final_sequence, generic_dna), batch, to_print=True)
                print("printing to output file....")
                SeqIO.write(record, output_destination, "fasta")
                print("ouput sucsessful")
                return "Output Sucsessful"

            # ties go to the most recent candidate
            if num_cutting <= best_num_cutting:
                best_num_cutting = num_cutting
                best_sequ = final_sequence
            iterations -= 1

        # return best sequence, as in one that is cut by the least amount of restriction enzymes
        if best_num_cutting > 0:
            cutting = check_restriction(Seq(best_sequ, generic_dna), batch, to_print=True)
            record = SeqRecord.SeqRecord(Seq(best_sequ, generic_dna), name=name)
            SeqIO.write(record, output_destination, "fasta")
            return "The enzymes the cut the sequence are:" + str(cutting) + "\n Output printed to specified location."

    # No enzyme requested, or the first candidate was already free of sites.
    SeqIO.write(record, output_destination, "fasta")
    return "ouput sucsessful"
def test_creating_batch(self):
    """Build restriction batches, modify them and check their content."""
    batch = RestrictionBatch([EcoRI])
    batch.add(KpnI)
    batch += EcoRV
    self.assertEqual(len(batch), 3)

    # Membership tested with the enzyme objects themselves
    for member in (EcoRV, EcoRI, KpnI):
        self.assertIn(member, batch)
    self.assertNotIn(SmaI, batch)

    # Membership tested with enzyme names
    self.assertIn('EcoRV', batch)
    self.assertNotIn('SmaI', batch)

    # get() succeeds for a member and raises for a non-member
    batch.get(EcoRV)
    with self.assertRaises(ValueError):
        batch.get(SmaI)

    batch.remove(EcoRV)
    self.assertEqual(len(batch), 2)

    for removed in (EcoRV, 'EcoRV'):
        self.assertNotIn(removed, batch)

    # Batches built from supplier codes. These checks may be 'update
    # sensitive' since company names and products may change often...
    batch = RestrictionBatch((), ('S'))  # Sigma
    self.assertEqual(batch.current_suppliers(),
                     ['Sigma Chemical Corporation'])
    self.assertIn(EcoRI, batch)
    self.assertNotIn(AanI, batch)
    batch.add_supplier('B')  # Life Technologies
    self.assertIn(AanI, batch)