def from_SeqRecord(cls, record: _SeqRecord, *args, linear=True, circular=False, n=5e-14, **kwargs):
    """Build an instance of ``cls`` from ``record`` without calling ``__init__``.

    The sequence of ``record`` is converted to a string and wrapped in a
    ``_Dseq`` created with ``ovhg=0``. ``id``, ``name``, ``description``,
    ``dbxrefs``, ``features`` and the per-letter annotations are taken over
    from ``record`` as they are (the same objects, not copies).

    Parameters
    ----------
    record : _SeqRecord
        Source of the sequence and of the metadata.
    linear, circular : bool, optional
        Forwarded unchanged to ``_Dseq.quick``.
    n : float, optional
        Stored on the new object as ``n``.
    *args, **kwargs
        Accepted but not used by this constructor.

    Returns
    -------
    An instance of ``cls`` with ``map_target`` set to ``None``.
    """
    obj = cls.__new__(cls)  # bypass __init__ on purpose

    sequence = str(record.seq)
    obj._seq = _Dseq.quick(
        sequence,
        _rc(sequence),
        ovhg=0,
        linear=linear,
        circular=circular,
    )

    # Plain pass-through fields.
    for field in ("id", "name", "description", "dbxrefs"):
        setattr(obj, field, getattr(record, field))

    # Start from the DNA default; entries of the record take precedence.
    annotations = {"molecule_type": "DNA"}
    annotations.update(record.annotations)
    obj.annotations = annotations

    obj._per_letter_annotations = record._per_letter_annotations
    obj.features = record.features
    obj.map_target = None
    obj.n = n
    return obj
def from_string(cls, record: str = "", *args, linear=True, circular=False, n=5e-14, **kwargs):
    """Build an instance of ``cls`` from a plain string without calling ``__init__``.

    ``record`` is wrapped in a ``_Dseq`` created with ``ovhg=0``. ``id``,
    ``name`` and ``description`` are set to placeholder strings equal to the
    attribute name; ``dbxrefs``, ``features`` and the per-letter annotations
    start out empty.

    Parameters
    ----------
    record : str, optional
        The sequence.
    linear, circular : bool, optional
        Forwarded unchanged to ``_Dseq.quick``.
    n : float, optional
        Stored on the new object as ``n``.
    *args
        Accepted but not used.
    **kwargs
        Written straight into the instance ``__dict__`` as the last step, so
        they override any attribute set before.

    Returns
    -------
    An instance of ``cls`` with ``map_target`` set to ``None``.
    """
    obj = cls.__new__(cls)  # bypass __init__ on purpose
    obj._seq = _Dseq.quick(record, _rc(record), ovhg=0, linear=linear, circular=circular)

    # Placeholder text fields: each one holds its own attribute name.
    for field in ("id", "name", "description"):
        setattr(obj, field, _pretty_str(field))

    obj.dbxrefs = []
    obj.annotations = {"molecule_type": "DNA"}
    obj._per_letter_annotations = {}
    obj.features = []
    obj.map_target = None
    obj.n = n

    # Caller supplied attributes are applied last.
    vars(obj).update(kwargs)
    return obj
def _fill_in_five_prime(self, nucleotides):
    """Extend the crick strand opposite a 5' overhang at the five prime end.

    When ``self.five_prime_end()`` reports the end type ``"5'"``, the letters
    of ``_rc(sticky_end)`` are appended one by one for as long as each letter
    is contained in ``nucleotides``; the first letter that is not allowed
    stops the extension. For any other end type nothing is appended.

    Parameters
    ----------
    nucleotides : container of str
        Letters that may be used for the extension.

    Returns
    -------
    tuple
        ``(crick + added, ovhg + len(added))``.
    """
    end_type, sticky_end = self.five_prime_end()

    added = []
    if end_type == "5'":
        for base in _rc(sticky_end):
            if base not in nucleotides:
                break
            added.append(base)

    extension = "".join(added)
    return self.crick + extension, self._ovhg + len(extension)
def _fill_in_three_prime(self, nucleotides):
    """Extend the watson strand opposite a 5' overhang at the three prime end.

    When ``self.three_prime_end()`` reports the end type ``"5'"``, the letters
    of ``_rc(sticky_end)`` are appended one by one for as long as each letter
    is contained in ``nucleotides``; the first letter that is not allowed
    stops the extension. For any other end type nothing is appended.

    Parameters
    ----------
    nucleotides : container of str
        Letters that may be used for the extension.

    Returns
    -------
    str
        ``watson + added``.
    """
    end_type, sticky_end = self.three_prime_end()

    added = []
    if end_type == "5'":
        for base in _rc(sticky_end):
            if base not in nucleotides:
                break
            added.append(base)

    return self.watson + "".join(added)
def looped(self):
    """Return a circularized Dseq object.

    A sequence that is already circular is returned unchanged (the very same
    object). A linear sequence can only be circularized when its two ends are
    compatible, otherwise a TypeError is raised.

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> a=Dseq("catcgatc")
    >>> a
    Dseq(-8)
    catcgatc
    gtagctag
    >>> a.looped()
    Dseq(o8)
    catcgatc
    gtagctag
    >>> a.T4("t")
    Dseq(-8)
    catcgat
     tagctag
    >>> a.T4("t").looped()
    Dseq(o7)
    catcgat
    gtagcta
    >>> a.T4("a")
    Dseq(-8)
    catcga
      agctag
    >>> a.T4("a").looped()
    Traceback (most recent call last):
      ...
    TypeError: DNA cannot be circularized.
    5' and 3' sticky ends not compatible!
    >>>

    Raises
    ------
    TypeError
        If the end types differ or the sticky ends do not match.
    """
    if self.circular:
        return self

    left_type, left_sticky = self.five_prime_end()
    right_type, right_sticky = self.three_prime_end()

    compatible = left_type == right_type and str(left_sticky) == str(_rc(right_sticky))
    if not compatible:
        raise TypeError("DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!")

    # Rotate the crick strand by the overhang so that both strands line up
    # with no stagger (ovhg=0) in the circular molecule.
    shift = -self._ovhg
    rotated_crick = self.crick[shift:] + self.crick[:shift]

    circular_seq = Dseq.quick(
        self.watson,
        rotated_crick,
        ovhg=0,
        linear=False,
        circular=True,
    )
    assert len(circular_seq.crick) == len(circular_seq.watson)
    return circular_seq
def assemble_circular(self):
    """Return the circular assembly products found in the graph ``self.G``.

    Every simple cycle of ``self.G`` is rotated so that it starts at its node
    with the lowest ``"order"`` attribute, and the cycles are processed in
    that sorted order. For each cycle all combinations of parallel edges are
    tried; a combination yields a product whose sequence is the
    concatenation of ``edge["seq"][edge["piece"]]`` over its edges. A product
    is skipped when its upper-cased sequence was already produced, either
    directly or as the ``_rc`` of an earlier product.

    Features of each edge are deep-copied, shifted by the running offset and
    wrapped around the origin when they extend past the end of the product.

    Returns
    -------
    list
        ``_Contig`` objects created with ``linear=False, circular=True``,
        sorted by length, longest first.
    """
    cps = {}  # circular products, keyed by upper-cased sequence
    cpsrc = {}  # the same products, keyed by _rc() of that key

    # Rotate each cycle to start at its lowest-"order" node, then sort.
    cpaths_sorted = []
    for cpath in sorted(_nx.simple_cycles(self.G), key=len):
        order, first = min((self.G.nodes[node]["order"], node) for node in cpath)
        i = cpath.index(first)
        cpaths_sorted.append((order, cpath[i:] + cpath[:i]))
    cpaths_sorted.sort()

    for _, cp in cpaths_sorted:
        # cp is a list of nodes representing a circular assembly;
        # repeat the first node at the end to close the cycle.
        cp += cp[0:1]

        # edgelol is a list of lists of all edges along cp
        edgelol = [
            [(u, v, d) for d in self.G[u][v].values()]
            for u, v in zip(cp, cp[1:])
        ]

        for edges in _itertools.product(*edgelol):
            # Skip combinations in which an edge starts, in the same
            # sequence, exactly where the previous edge stopped.
            if any(
                (e["seq"], e["piece"].stop) == (z["seq"], z["piece"].start)
                for (_, _, e), (_, _, z) in zip(edges, edges[1:])
            ):
                continue

            ct = "".join(e["seq"][e["piece"]] for u, v, e in edges)
            key = ct.upper()
            if key in cps or key in cpsrc:
                continue  # TODO: cpsrc not needed?

            sg = _nx.DiGraph()
            sg.add_edges_from(edges)
            sg.add_nodes_from((n, d) for n, d in self.G.nodes(data=True) if n in cp)

            # Collect the features of every edge, shifted to product coordinates.
            edgefeatures = []
            offset = 0
            for u, v, e in edges:
                feats = _deepcopy(e["features"])
                for feat in feats:
                    feat.location += offset
                edgefeatures.extend(feats)
                offset += e["piece"].stop - e["piece"].start

            # Wrap features that reach past the end of the circular product.
            ct_len = len(ct)
            for f in edgefeatures:
                # ">=": a feature that starts exactly at the end lies wholly
                # past it and has to be shifted; splitting it would leave an
                # empty first part.
                if f.location.start >= ct_len and f.location.end > ct_len:
                    f.location += -ct_len
                elif f.location.end > ct_len:
                    # NOTE(review): the two parts are built without a strand
                    # argument, so a strand on the original location does not
                    # appear to be carried over -- confirm whether intended.
                    f.location = _CompoundLocation(
                        (
                            _FeatureLocation(f.location.start, _ExactPosition(ct_len)),
                            _FeatureLocation(_ExactPosition(0), f.location.end - ct_len),
                        )
                    )

            cps[key] = cpsrc[_rc(key)] = (
                ct,
                edgefeatures,
                sg,
                {n: self.nodemap[n] for n in cp[:-1]},
                cp,
            )

    return sorted(
        (
            _Contig.from_string(
                seq,
                features=feats,
                graph=graph,
                nodemap=nodemap,
                linear=False,
                circular=True,
            )
            for seq, feats, graph, nodemap, _path in cps.values()
        ),
        key=len,
        reverse=True,
    )
def quick(cls, watson: str, crick: str, ovhg=0, linear=True, circular=False, pos=0):
    """Create an instance of ``cls`` from both strands without calling ``__init__``.

    No validation is done; the arguments are stored as given.

    Parameters
    ----------
    watson, crick : str
        The two strands; each is stored wrapped in ``_pretty_str``.
    ovhg : int, optional
        Stagger between the strands, stored as ``_ovhg``.
    linear, circular : bool, optional
        Stored as ``_linear`` and ``_circular`` respectively.
    pos : int, optional
        Stored as ``pos``.

    Returns
    -------
    An instance of ``cls`` whose ``_data`` is ``_rc`` of the crick part
    sticking out on the left (only for ``ovhg > 0``), followed by ``watson``,
    followed by ``_rc`` of the crick part that extends past the right end of
    ``watson``.
    """
    left_ovhg = max(0, ovhg)

    # For ovhg <= 0 the slice start becomes len(crick), i.e. an empty slice.
    left_part = crick[-left_ovhg or len(crick):]
    right_part = crick[:max(0, len(crick) - ovhg - len(watson))]

    obj = cls.__new__(cls)  # bypass __init__ on purpose
    obj.watson = _pretty_str(watson)
    obj.crick = _pretty_str(crick)
    obj._ovhg = ovhg
    obj._circular = circular
    obj._linear = linear
    obj.length = max(len(watson) + left_ovhg, len(crick) + max(0, -ovhg))
    obj.pos = pos
    obj._data = _rc(left_part) + watson + _rc(right_part)
    return obj
def from_string(cls, dna: str, *args, linear=True, circular=False, **kwargs):
    """Create an instance of ``cls`` from one strand without calling ``__init__``.

    ``dna`` becomes the watson strand and ``_rc(dna)`` the crick strand, with
    no stagger between them (``_ovhg`` is 0).

    Parameters
    ----------
    dna : str
        The watson strand; also stored unchanged as ``_data``.
    linear, circular : bool, optional
        Stored as ``_linear`` and ``_circular`` respectively.
    *args, **kwargs
        Accepted but not used.

    Returns
    -------
    An instance of ``cls`` with ``pos`` 0 and ``length`` ``len(dna)``.
    """
    obj = cls.__new__(cls)  # bypass __init__ on purpose

    # The two strands
    obj.watson = _pretty_str(dna)
    obj.crick = _pretty_str(_rc(dna))
    obj._ovhg = 0

    # Topology flags are stored exactly as passed in
    obj._circular = circular
    obj._linear = linear

    obj.length = len(dna)
    obj.pos = 0
    obj._data = dna
    return obj
def __add__(self, other):
    """Simulate ligation between two DNA fragments.

    ``other`` is added at the end of this sequence. A TypeError is raised if
    any of the points below is fulfilled:

    * one or more of the objects are circular
    * the three prime sticky end of self is not of the same type
      (5' or 3') as the five prime sticky end of other
    * the three prime sticky end of self is not complementary to the
      five prime sticky end of other

    If the ends do not fit but one of the two operands is empty (falsy), a
    shallow copy of the other operand is returned instead of raising.

    Phosphorylation and dephosphorylation are not considered. DNA is always
    presumed to have the 5' phosphate group necessary for ligation.
    """
    # Circular DNA has no free ends. An `other` that lacks a `circular`
    # attribute is treated as not circular.
    if self.circular or getattr(other, "circular", False):
        raise TypeError("circular DNA cannot be ligated!")

    self_type, self_tail = self.three_prime_end()
    other_type, other_tail = other.five_prime_end()

    if self_type == other_type and str(self_tail) == str(_rc(other_tail)):
        return Dseq.quick(self.watson + other.watson, other.crick + self.crick, self._ovhg)
    if not self:
        return _copy.copy(other)
    if not other:
        return _copy.copy(self)
    raise TypeError("sticky ends not compatible!")
def reverse_complement(self):
    """Return the reverse complement together with a mirrored assembly graph.

    The sequence part comes from the parent class and is wrapped in
    ``type(self)``. A new directed graph is built from ``self.graph`` with
    every edge reversed and every node translated through ``self.nodemap``.
    On each edge of the new graph:

    * ``"name"`` loses a trailing ``"_rc"`` or otherwise gains one, in which
      case the result is cut to 13 characters
    * ``"seq"`` is replaced by ``_rc(seq)``
    * ``"piece"`` is mirrored within the sequence, taking the ``"length"``
      of the two end nodes into account
    * ``"features"`` are replaced by ``f._flip(len(seq))`` of each feature

    The returned object gets the new graph and the inverted node map.
    """
    answer = type(self)(super().reverse_complement())

    nm = self.nodemap
    edges_backwards = list(self.graph.edges(data=True))
    edges_backwards.reverse()
    nodes_backwards = list(self.graph.nodes(data=True))
    nodes_backwards.reverse()

    g = _nx.DiGraph()
    g.add_edges_from([(nm[v], nm[u], d) for u, v, d in edges_backwards])
    g.add_nodes_from((nm[n], d) for n, d in nodes_backwards)

    for u, v, ed in g.edges(data=True):
        # Toggle the "_rc" marker on the fragment name.
        old_name = ed["name"]
        if old_name.endswith("_rc"):
            ed["name"] = old_name[:-3]
        else:
            ed["name"] = "{}_rc".format(old_name)[:13]

        ed["seq"] = _rc(ed["seq"])
        ln = len(ed["seq"])

        # Mirror the slice within the reverse complemented sequence.
        piece = ed["piece"]
        ed["piece"] = slice(
            ln - piece.stop - g.nodes[u]["length"],
            ln - piece.start - g.nodes[v]["length"],
        )
        ed["features"] = [f._flip(ln) for f in ed["features"]]

    answer.graph = g
    answer.nodemap = {mapped: original for original, mapped in nm.items()}
    return answer
def _annealing_positions(primer, template, limit=15):
    """Find the annealing position(s) for a primer on a template.

    The primer has to anneal perfectly with at least ``limit`` nucleotides
    in its 3' part. The primer is the lower strand in the figure below.

    start is a position (integer), footprint length is an integer.

    ::

        <- - - - - - - - - - template - - - - - - - - - - - - - >

              <------- start (int) ------>
        5'-...gctactacacacgtactgactgcctccaagatagagtcagtaaccacactcgat...3'
              ||||||||||||||||||||||||||||||||||||||||||||||||
                                     3'-gttctatctcagtcattggtgtATAGTG-5'

                                        <-footprint length -->

    Extended IUPAC codes are expanded only in the ``limit`` nucleotides that
    have to anneal; the remainder of the primer is compared letter by
    letter, ignoring case.

    Parameters
    ----------
    primer : string
        The primer sequence 5'-3'

    template : string
        The template sequence 5'-3'

    limit : int = 15, optional
        footprint needs to be at least of length limit.

    Returns
    -------
    describe : list of tuples (int, int)
        [ (start1, footprint1), (start2, footprint2) ,..., ]
        An empty list if the primer is shorter than ``limit`` or nothing
        anneals.
    """
    if len(primer) < limit:
        return []  # primer too short

    prc = _rc(primer)

    iupac = {
        "R": "(A|G)",
        "Y": "(C|T)",
        "S": "(G|C)",
        "W": "(A|T)",
        "K": "(G|T)",
        "M": "(A|C)",
        "B": "(C|G|T)",
        "D": "(A|G|T)",
        "H": "(A|C|T)",
        "V": "(A|C|G)",
        "N": "(A|G|C|T)",
    }

    # head is the minimum part of the primer that has to anneal, turned
    # into a regex that reflects the extended IUPAC DNA code
    head = prc[:limit].upper()
    for code, alternatives in iupac.items():
        head = head.replace(code, alternatives)

    # The lookahead makes overlapping matches visible as well.
    pattern = "(?={})".format(head)

    tail = prc[limit:]
    results = []
    for match in _re.finditer(pattern, template, _re.I):
        match_start = match.start()
        window = template[match_start + limit:match_start + limit + len(tail)]

        # Count how far the rest of the primer keeps matching the template.
        extra = 0
        for primer_base, template_base in zip(tail, window):
            if primer_base.lower() != template_base.lower():
                break
            extra += 1

        results.append((match_start, extra + limit))
    return results
def __init__(self, watson, crick=None, ovhg=None, linear=None, circular=None, pos=0):
    """Set up a double stranded sequence from one or two strands.

    Parameters
    ----------
    watson : str
        The watson strand.
    crick : str, optional
        The crick strand. If omitted, ``_rc(watson)`` is used and the
        stagger is 0.
    ovhg : int, optional
        Stagger between the strands. A positive value shifts the watson
        strand to the right relative to the crick strand, a negative value
        shifts the crick strand. If omitted while ``crick`` is given, it is
        derived from the longest common substring of ``watson`` and
        ``_rc(crick)``.
    linear, circular : bool, optional
        The object becomes circular if ``circular`` is truthy while
        ``linear`` is falsy, or if ``linear == False`` while ``circular`` is
        None. In every other case it is linear.
    pos : int, optional
        Stored as ``pos``.

    Raises
    ------
    ValueError
        If ``ovhg`` is given without ``crick``; if the strands cannot be
        annealed (including an empty ``watson``); or if there is more than
        one equally good way to anneal them.
    """
    if crick is None:
        if ovhg is not None:  # ovhg given, but no crick strand
            raise ValueError("ovhg defined without crick strand!")
        crick = _rc(watson)
        ovhg = 0
        data = watson

    elif ovhg is None:  # crick strand given, stagger has to be worked out
        if len(watson) == 0:
            # math.log(0) below would fail with an unhelpful
            # "math domain error"; an empty strand cannot anneal anyway.
            raise ValueError(
                "Could not anneal the two strands. Please provide ovhg value"
            )
        # Shortest overlap considered is floor(log4(len(watson))).
        olaps = _common_sub_strings(
            str(watson).lower(),
            str(_rc(crick).lower()),
            int(_math.log(len(watson)) / _math.log(4)),
        )
        # NOTE(review): entries are presumably (start in watson, start in
        # rc(crick), length) with the longest first -- confirm against
        # _common_sub_strings.
        try:
            F, T, L = olaps[0]
        except IndexError:
            raise ValueError(
                "Could not anneal the two strands. Please provide ovhg value"
            ) from None
        ovhgs = [ol[1] - ol[0] for ol in olaps if ol[2] == L]
        if len(ovhgs) > 1:
            raise ValueError(
                "More than one way of annealing the strands. Please provide ovhg value"
            )
        ovhg = T - F

        # Lay both strands out with their stagger as leading blanks and take,
        # position by position, the watson letter if there is one and the
        # rc(crick) letter otherwise.
        sns = (ovhg * " ") + _pretty_str(watson)
        asn = (-ovhg * " ") + _pretty_str(_rc(crick))
        data = "".join(
            a.strip() or b.strip()
            for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ")
        )

    elif ovhg == 0:
        if len(watson) >= len(crick):
            data = watson
        else:
            # crick extends past the right end of watson
            data = watson + _rc(crick[:len(crick) - len(watson)])

    elif ovhg > 0:
        if ovhg + len(watson) > len(crick):
            data = _rc(crick[-ovhg:]) + watson
        else:
            data = (
                _rc(crick[-ovhg:])
                + watson
                + _rc(crick[:len(crick) - ovhg - len(watson)])
            )

    else:  # ovhg < 0
        if -ovhg + len(crick) > len(watson):
            data = watson + _rc(crick[:-ovhg + len(crick) - len(watson)])
        else:
            data = watson

    self._data = data
    # "linear == False" is kept on purpose: None must not count as False here.
    self._circular = (bool(circular) and not linear) or (
        linear == False and circular is None  # noqa: E712
    )
    self._linear = not self._circular
    self.watson = _pretty_str(watson)
    self.crick = _pretty_str(crick)
    self.length = len(data)
    self._ovhg = ovhg
    self.pos = pos