def looped(self):
    """Return a circular version of the Dseqrecord object.

    The underlying Dseq object has to have compatible ends. Feature
    locations are shifted by the overhang of the linear sequence, and a
    feature that ends up crossing the origin of the circular sequence is
    split into a two-part compound location.

    The returned record gets its own feature objects, so the features of
    the record this method is called on keep their locations. Qualifiers
    are still shared between the original features and the copies.

    Returns
    -------
    A circular copy of this record.

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> a=Dseqrecord("aaa")
    >>> a
    Dseqrecord(-3)
    >>> b=a.looped()
    >>> b
    Dseqrecord(o3)
    >>>

    See also
    --------
    pydna.dseq.Dseq.looped
    """
    new = _copy.copy(self)
    for key, value in list(self.__dict__.items()):
        setattr(new, key, value)

    # The copy above is shallow: new.features would be the very same list,
    # holding the very same feature objects, as self.features. Re-assigning
    # .location below would then silently move the features of the linear
    # record as well. Shallow-copy each feature so only the copies change.
    new.features = [_copy.copy(feature) for feature in self.features]

    new._seq = self.seq.looped()
    length = len(new)

    # The direction of the shift depends on which strand carries the
    # 5' overhang; for any other end type the locations are left untouched.
    end_type = self.seq.five_prime_end()[0]
    if end_type == "5'":
        offset = self.seq.ovhg
    elif end_type == "3'":
        offset = -self.seq.ovhg
    else:
        offset = None

    for feature in new.features:
        if offset is not None:
            feature.location = feature.location + offset
        if feature.location.start < 0:
            # The feature starts before the origin: wrap its head around
            # to the end of the circular sequence.
            loc1 = _FeatureLocation(length + feature.location.start, length, strand=feature.strand)
            loc2 = _FeatureLocation(0, feature.location.end, strand=feature.strand)
            feature.location = _CompoundLocation([loc1, loc2])
        if feature.location.end > length:
            # The feature runs past the end: wrap its tail to the start.
            loc1 = _FeatureLocation(feature.location.start, length, strand=feature.strand)
            loc2 = _FeatureLocation(0, feature.location.end - length, strand=feature.strand)
            feature.location = _CompoundLocation([loc1, loc2])
    return new
def lcs(self, other, *args, limit=25, **kwargs):
    """Return the longest common substring shared with another sequence.

    The other sequence can be a string, Seq, SeqRecord, Dseq or DseqRecord.
    The comparison is case-insensitive. For objects whose ``.seq`` has a
    ``watson`` attribute, the watson strand is the one compared.

    The method returns a SeqFeature with type "read" as this method
    is mostly used to map sequence reads to the sequence. This can be
    changed by passing a type as keyword with some other string value.

    Parameters
    ----------
    other : str or sequence-like object
        The sequence to compare with.
    limit : int, optional
        Passed on as the ``limit`` of the common-substring search; a falsy
        value falls back to 25.
    **kwargs
        ``type`` overrides the feature type ("read") and ``label`` overrides
        the feature label (``other.name`` if present, else "sequence").

    Returns
    -------
    SeqFeature
        A feature on the forward strand covering the match in this sequence,
        or an empty ``SeqFeature()`` when no common substring was found.

    Examples
    --------
    >>> from pydna.seqrecord import SeqRecord
    >>> a = SeqRecord("GGATCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(6), strand=1), type='read')
    >>> a.lcs("GATC", limit=4)
    SeqFeature(FeatureLocation(ExactPosition(1), ExactPosition(5), strand=1), type='read')
    >>> a = SeqRecord("CCCCC")
    >>> a.lcs("GGATCC", limit=6)
    SeqFeature(None)
    """
    # longest_common_substring
    # https://biopython.org/wiki/ABI_traces
    if hasattr(other, "seq"):
        r = other.seq
        if hasattr(r, "watson"):
            r = str(r.watson).lower()
        else:
            r = str(r).lower()
    else:
        r = str(other.lower())

    olaps = _common_sub_strings(str(self.seq).lower(), r, limit=limit or 25)

    try:
        # Only the first reported overlap is used.
        start_in_self, _start_in_other, length = olaps.pop(0)
    except IndexError:
        result = _SeqFeature()
    else:
        label = "sequence" if not hasattr(other, "name") else other.name
        result = _SeqFeature(
            # The strand is set on the location rather than passed to the
            # SeqFeature constructor: the constructor argument was
            # deprecated and later removed in newer Biopython releases,
            # while a strand on the location works in old and new ones.
            _FeatureLocation(start_in_self, start_in_self + length, strand=1),
            type=kwargs.get("type") or "read",
            qualifiers={
                "label": [kwargs.get("label") or label],
                "ApEinfo_fwdcolor": ["#DAFFCF"],
                "ApEinfo_revcolor": ["#DFFDFF"],
            },
        )
    return result
def assemble_circular(self): cps = {} # circular assembly cpsrc = {} cpaths = sorted( _nx.simple_cycles(self.G), key=len) cpaths_sorted=[] for cpath in cpaths: order, node = min((self.G.nodes[node]["order"],node) for node in cpath) i=cpath.index(node) cpaths_sorted.append((order, cpath[i:]+cpath[:i])) cpaths_sorted.sort() for _, cp in cpaths_sorted: # cpaths is a list of nodes representing a circular assembly edgelol = [] # edgelol is a list of lists of all edges along cp cp+= cp[0:1] for u,v in zip(cp, cp[1:]): e=[] for d in self.G[u][v].values(): e.append((u,v,d)) edgelol.append(e) for edges in _itertools.product(*edgelol): if [True for ((u,v,e),(x,y,z)) in zip(edges, edges[1:]) if (e["seq"],e["piece"].stop) == (z["seq"],z["piece"].start)]: continue ct = "".join(e["seq"][e["piece"]] for u,v,e in edges) key=ct.upper() if key in cps or key in cpsrc: continue # TODO: cpsrc not needed? sg=_nx.DiGraph() sg.add_edges_from(edges) sg.add_nodes_from( (n,d) for n,d in self.G.nodes(data=True) if n in cp ) edgefeatures=[] offset=0 for u,v,e in edges: feats = _deepcopy(e["features"]) for feat in feats: feat.location+=offset edgefeatures.extend(feats) offset+=e["piece"].stop-e["piece"].start for f in edgefeatures: if f.location.start>len(ct) and f.location.end>len(ct): f.location+=(-len(ct)) elif f.location.end>len(ct): f.location = _CompoundLocation((_FeatureLocation(f.location.start,_ExactPosition(len(ct))),_FeatureLocation(_ExactPosition(0), f.location.end-len(ct)))) cps[key] = cpsrc[_rc(key)] = ct, edgefeatures, sg, {n:self.nodemap[n] for n in cp[:-1]}, cp return sorted((_Contig.from_string(cp[0], features = cp[1], graph = cp[2], nodemap = cp[3], linear=False, circular=True) for cp in cps.values()), key=len, reverse=True)
def rarecodons(self, organism="sce"):
    """Return a list of features marking the rare codons of the sequence.

    The upper-cased sequence is read in frame from position 0, three
    letters at a time; an incomplete trailing triplet is ignored. Each
    triplet found in ``_rare_codons[organism]`` yields one SeqFeature of
    type ``rare_codon_<organism>`` labelled with the triplet itself.

    Parameters
    ----------
    organism : str, optional
        Key into the ``_rare_codons`` table, "sce" by default. An unknown
        key raises ``KeyError``.

    Returns
    -------
    list
        SeqFeature objects in sequence order; empty if nothing matched.
    """
    rare_set = _rare_codons[organism]
    sequence = str(self.seq).upper()
    feature_type = f"rare_codon_{organism}"
    codon_starts = range(0, 3 * (len(self) // 3), 3)
    return [
        _SeqFeature(
            _FeatureLocation(start, start + 3),
            type=feature_type,
            qualifiers={"label": sequence[start:start + 3]},
        )
        for start in codon_starts
        if sequence[start:start + 3] in rare_set
    ]
def products(self):
    """Return the list of Amplicon objects, one per forward/reverse primer pair.

    The list is built on first use and stored in ``self._products``.

    NOTE(review): the cache test is a truthiness test, so an empty list
    (no products, or a list the caller has emptied) is rebuilt on the
    next call.
    """
    if self._products:
        return self._products
    self._products = []
    for fp in self.forward_primers:
        for rp in self.reverse_primers:
            if self.template.circular:
                # Move the origin to where the forward primer footprint
                # starts, then double the template so that a product
                # running over the original origin can be sliced out.
                tmpl = self.template.shifted(fp.position - fp._fp)
                tmpl = tmpl[:] * 2
                for f in tmpl.features:
                    # Two consecutive location parts that meet across the
                    # boundary between the two template copies are replaced
                    # by one continuous location.
                    for x, y in zip(f.location.parts, f.location.parts[1:]):
                        if x.end == y.start + len(self.template):
                            f.location = _FeatureLocation(
                                x.start,
                                y.end + len(self.template),
                                strand=f.location.strand,
                            )
                if fp.position > rp.position:
                    # Reverse primer lies before the forward primer: the
                    # product wraps around the origin.
                    tmpl = tmpl[:len(self.template) - fp.position + rp.position + rp._fp + fp._fp]
                else:
                    tmpl = tmpl[:rp.position + rp._fp - (fp.position - fp._fp)]
            else:
                tmpl = self.template[fp.position - fp._fp:rp.position + rp._fp]
            # Product = forward tail + amplified template + reverse
            # complement of the reverse primer tail.
            prd = (_Dseqrecord(fp.tail) + tmpl + _Dseqrecord(rp.tail).reverse_complement())
            # Features spanning the whole amplified template part; the first
            # one, if any, provides the name of the product.
            full_tmpl_features = [
                f for f in tmpl.features
                if f.location.start == 0 and f.location.end == len(tmpl)
            ]
            new_identifier = ""
            if full_tmpl_features:
                ft = full_tmpl_features[0]
                if "label" in ft.qualifiers:
                    new_identifier = " ".join(ft.qualifiers["label"])
                elif "note" in ft.qualifiers:
                    new_identifier = " ".join(ft.qualifiers["note"])
            from pydna.utils import (
                identifier_from_string as _identifier_from_string,
            )  # TODO: clean this up
            # Name, id and description: feature-derived identifier first,
            # then the keyword given to the constructor, then a generated
            # default.
            prd.name = (_identifier_from_string(new_identifier)[:16]
                        or self.kwargs.get("name")
                        or "{}bp_PCR_prod".format(len(prd))[:16])
            prd.id = (_identifier_from_string(new_identifier)[:16]
                      or self.kwargs.get("id") or "{}bp {}".format(
                          str(len(prd))[:14], prd.seguid()))
            prd.description = self.kwargs.get(
                "description") or "pcr product_{}_{}".format(
                    fp.description, rp.description)
            amplicon = _Amplicon(prd,
                                 template=self.template,
                                 forward_primer=fp,
                                 reverse_primer=rp,
                                 **self.kwargs)
            # amplicon.forward_primer.amplicon = amplicon
            # amplicon.reverse_primer.amplicon = amplicon
            self._products.append(amplicon)
    return self._products
def __init__(self, primers, template, limit=13, **kwargs):
    r"""The Anneal class has to be initiated with at least an iterable of
    primers and a template.

    The template is deep-copied; a ``primer_bind`` feature is added to the
    copy for every annealing position found, so the object passed in is
    not modified.

    Parameters
    ----------
    primers : iterable of :class:`Primer` or Biopython SeqRecord like
              objects
        Primer sequences 5'-3'.

    template : Dseqrecord
        The template sequence 5'-3'.

    limit : int, optional
        limit length of the annealing part of the primers.

    **kwargs
        Stored as ``self.kwargs``.

    Attributes
    ----------
    products: list
        A list of Amplicon objects, one for each primer pair that may
        form a PCR product.

    Examples
    --------
    >>> from pydna.readers import read
    >>> from pydna.amplify import Anneal
    >>> from pydna.dseqrecord import Dseqrecord as Ds
    >>> t = Ds("tacactcaccgtctatcattatcta"
    ...        "ctatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = read(">p1\ntacactcaccgtctatcattatc", ds = False)
    >>> p2 = read(">p2\ngtgctatcagatgatacagtcg", ds = False)
    >>> ann = Anneal((p1, p2), t)
    >>> print(ann.report())
    Template name 51 nt linear:
    p1 anneals forward (--->) at 23
    p2 anneals reverse (<---) at 29
    >>> ann.products
    [Amplicon(51)]
    >>> amplicon_list = ann.products
    >>> amplicon = amplicon_list.pop()
    >>> amplicon
    Amplicon(51)
    >>> print(amplicon.figure())
    5tacactcaccgtctatcattatc...cgactgtatcatctgatagcac3
                               ||||||||||||||||||||||
                              3gctgacatagtagactatcgtg5
    5tacactcaccgtctatcattatc3
     |||||||||||||||||||||||
    3atgtgagtggcagatagtaatag...gctgacatagtagactatcgtg5
    >>> print(amplicon)
    Dseqrecord
    circular: False
    size: 51
    ID: 51bp U96-TO06Y6pFs74SQx8M1IVTBiY
    Name: 51bp_PCR_prod
    Description: pcr product_p1_p2
    Number of features: 2
    /molecule_type=DNA
    Dseq(-51)
    taca..gcac
    atgt..cgtg
    >>> print(amplicon.program())
    <BLANKLINE>
    |95°C|95°C               |    |tmf:59.5
    |____|_____          72°C|72°C|tmr:59.7
    |5min|30s  \ 47.7°C _____|____|30s/kb
    |    |      \______/ 0: 1|5min|GC 39%
    |    |       30s         |    |51bp
    >>>
    """
    self.primers = primers
    # Work on a private copy: primer_bind features are appended below.
    self.template = _copy.deepcopy(template)
    self.limit = limit
    self.kwargs = kwargs
    self._products = None
    self.forward_primers = []
    self.reverse_primers = []
    twl = len(self.template.seq.watson)
    tcl = len(self.template.seq.crick)
    if self.template.linear:
        tw = self.template.seq.watson
        tc = self.template.seq.crick
    else:
        # Circular template: search each strand concatenated with itself so
        # that sites spanning the origin are found as well.
        tw = self.template.seq.watson + self.template.seq.watson
        tc = self.template.seq.crick + self.template.seq.crick
    for p in self.primers:
        # Forward primers are searched against the crick strand; the hit
        # position is converted using the crick length and a negative
        # overhang. Hits with pos >= tcl (duplicates in the second copy
        # of a doubled strand) are dropped.
        self.forward_primers.extend((
            _Primer(
                p,
                #          template = self.template,
                position=tcl - pos - min(self.template.seq.ovhg, 0),
                footprint=fp,
            ) for pos, fp in _annealing_positions(str(p.seq), tc, self.limit)
            if pos < tcl))
        # Reverse primers are searched against the watson strand.
        self.reverse_primers.extend((
            _Primer(
                p,
                #          template = self.template,
                position=pos + max(0, self.template.seq.ovhg),
                footprint=fp,
            ) for pos, fp in _annealing_positions(str(p.seq), tw, self.limit)
            if pos < twl))
    self.forward_primers.sort(key=_operator.attrgetter("position"))
    self.reverse_primers.sort(key=_operator.attrgetter("position"),
                              reverse=True)
    for fp in self.forward_primers:
        if fp.position - fp._fp >= 0:
            # The footprint [position - _fp, position) lies inside the template.
            start = fp.position - fp._fp
            end = fp.position
            self.template.features.append(
                _SeqFeature(
                    _FeatureLocation(start, end),
                    type="primer_bind",
                    strand=1,
                    qualifiers={
                        "label": [fp.name],
                        "ApEinfo_fwdcolor": ["#baffa3"],
                        "ApEinfo_revcolor": ["#ffbaba"],
                    },
                ))
        else:
            # The footprint starts before position 0: annotate it as a
            # two-part location (tail of the template + head).
            # NOTE(review): no strand is given for this feature, unlike
            # the branch above.
            start = len(self.template) - fp._fp + fp.position
            end = start + fp._fp - len(self.template)
            sf = _SeqFeature(
                _CompoundLocation([
                    _FeatureLocation(start, len(self.template)),
                    _FeatureLocation(0, end),
                ]),
                type="primer_bind",
                location_operator="join",
                qualifiers={
                    "label": [fp.name],
                    "ApEinfo_fwdcolor": ["#baffa3"],
                    "ApEinfo_revcolor": ["#ffbaba"],
                },
            )
            self.template.features.append(sf)
    for rp in self.reverse_primers:
        if rp.position + rp._fp <= len(self.template):
            # The footprint [position, position + _fp) lies inside the template.
            start = rp.position
            end = rp.position + rp._fp
            self.template.features.append(
                _SeqFeature(
                    _FeatureLocation(start, end),
                    type="primer_bind",
                    strand=-1,
                    qualifiers={
                        "label": [rp.name],
                        "ApEinfo_fwdcolor": ["#baffa3"],
                        "ApEinfo_revcolor": ["#ffbaba"],
                    },
                ))
        else:
            # The footprint runs past the end of the template: two-part
            # location.
            # NOTE(review): this feature carries only a label, without
            # the ApEinfo colour qualifiers used elsewhere.
            start = rp.position
            end = rp.position + rp._fp - len(self.template)
            self.template.features.append(
                _SeqFeature(
                    _CompoundLocation([
                        _FeatureLocation(0, end),
                        _FeatureLocation(start, len(self.template)),
                    ]),
                    type="primer_bind",
                    location_operator="join",
                    strand=-1,
                    qualifiers={"label": [rp.name]},
                ))
def map_trace_files(self, pth, limit=25):  # TODO allow path-like objects
    """Map ABI trace files onto the sequence as features of type "trace".

    Every file matching the glob pattern ``pth`` is read with
    ``SeqIO.read(name, "abi")`` and lower-cased. If ``self.map_target`` is
    a slice (has ``step``) or a feature (has ``extract``), only reads
    containing that region or its reverse complement are used, and the
    reads are sorted into ``self.matching_reads`` and
    ``self.not_matching_reads``; otherwise both attributes are set to None
    and all reads are used.

    Parameters
    ----------
    pth : str
        Glob pattern for the trace files.
    limit : int, optional
        Passed on to the common-substring search.

    Returns
    -------
    list
        File names of the reads for which a feature was added.

    Raises
    ------
    ValueError
        If no file matches ``pth``.
    """
    import glob

    traces = []
    for filename in glob.glob(pth):
        record = SeqIO.read(filename, "abi").lower()
        record.annotations["filename"] = filename
        record.fname = filename
        traces.append(record)
    if not traces:
        raise ValueError("No trace files found in {}".format(pth))

    map_target = self.map_target
    if hasattr(map_target, "step"):
        area = map_target
    elif hasattr(map_target, "extract"):
        area = slice(map_target.location.start, map_target.location.end)
    else:
        area = None  # TODO allow other objects as well and do some checks on map target

    if area:
        self.matching_reads = []
        self.not_matching_reads = []
        region = self[area].seq
        target = str(region).lower()
        target_rc = str(region.rc()).lower()
        for record in traces:
            read_text = str(record.seq)
            if target in read_text or target_rc in read_text:
                self.matching_reads.append(record)
            else:
                self.not_matching_reads.append(record)
        reads = self.matching_reads
    else:
        self.matching_reads = None
        self.not_matching_reads = None
        reads = traces

    own_sequence_source = self.seq
    mapped = []
    for read_ in reads:
        matches = _common_sub_strings(str(own_sequence_source).lower(), str(read_.seq), limit)
        if not matches:
            continue

        if len(matches) > 1:
            # Keep the first match, plus each later match that begins
            # strictly after the end of the match listed just before it,
            # both in this sequence and in the read.
            kept = [matches[0]]
            for (prev_self, prev_read, prev_len), candidate in zip(matches, matches[1:]):
                if prev_self + prev_len < candidate[0] and prev_read + prev_len < candidate[1]:
                    kept.append(candidate)
        else:  # len(matches)==1
            kept = matches

        mapped.append(read_)

        if len(kept) > 1:
            loc = _CompoundLocation([_FeatureLocation(m[0], m[0] + m[2]) for m in kept])
        else:
            start, _start_in_read, length = kept[0]
            loc = _FeatureLocation(start, start + length)

        self.features.append(
            _SeqFeature(
                loc,
                qualifiers={"label": [read_.annotations["filename"]]},
                type="trace",
            ))

    return [record.annotations["filename"] for record in mapped]
def shifted(self, shift):
    """Returns a circular Dseqrecord with a new origin <shift>.

    This only works on circular Dseqrecords. If we consider the following
    circular sequence:

    | ``GAAAT   <-- watson strand``
    | ``CTTTA   <-- crick strand``

    The T and the G on the watson strand are linked together as well
    as the A and the C of the crick strand.

    if ``shift`` is 1, this indicates a new origin at position 1:

    |    new origin at the | symbol:
    |
    | ``G|AAAT``
    | ``C|TTTA``

    new sequence:

    | ``AAATG``
    | ``TTTAC``

    Features are carried over with their locations moved accordingly.
    If ``shift`` is 0 or a multiple of the sequence length, the object
    itself (not a copy) is returned.

    Raises
    ------
    TypeError
        If the sequence is linear.

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> a=Dseqrecord("aaat",circular=True)
    >>> a
    Dseqrecord(o4)
    >>> a.seq
    Dseq(o4)
    aaat
    ttta
    >>> b=a.shifted(1)
    >>> b
    Dseqrecord(o4)
    >>> b.seq
    Dseq(o4)
    aata
    ttat
    """
    if self.linear:
        raise TypeError(
            "Sequence is linear, origin can only be shifted for circular sequences.\n"
        )
    ln = len(self)
    if not shift % ln:
        return self  # shift is a multiple of ln or 0
    else:
        shift %= ln  # 0 < shift < ln here, the zero case returned above
    newseq = (self.seq[shift:] + self.seq[:shift]).looped()
    # From here on "shift" is the distance features move to the right,
    # which is also where the old origin sits in the new coordinates.
    shift = ln - shift
    newfeatures = []
    for feature in self.features:
        shiftedparts = [
            featurelocation + shift
            for featurelocation in feature.location.parts
        ]
        # Zero-length parts are skipped by the loop below (neither
        # newstart < newend nor newstart > newend); they are re-added
        # unchanged afterwards.
        zero_length_parts = [
            featurelocation for featurelocation in shiftedparts
            if featurelocation.start == featurelocation.end
        ]
        newparts = []
        for location in shiftedparts:
            # Bring both ends back into the range 0..ln-1.
            newstart = location.start % ln
            newend = location.end % ln
            if newstart < newend:
                newparts.append(
                    _FeatureLocation(
                        newstart,
                        newend,
                        location.strand,
                        location.ref,
                        location.ref_db,
                    ))
            elif newstart > newend:
                # The part now crosses the new origin: split it in two.
                # The order of the two pieces depends on the strand.
                if location.strand == 1:
                    newparts.extend([
                        _FeatureLocation(
                            newstart,
                            ln,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                        _FeatureLocation(
                            0,
                            newend,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                    ])
                else:
                    newparts.extend([
                        _FeatureLocation(
                            0,
                            newend,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                        _FeatureLocation(
                            newstart,
                            ln,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        ),
                    ])
        # A part ending at the old origin and a part starting there are
        # merged into one continuous part.
        p = next((p for p in newparts if p.end == shift), None)
        s = next((p for p in newparts if p.start == shift), None)
        if p and s:
            newparts.remove(p)
            newparts[newparts.index(s)] = _FeatureLocation(
                p.start, s.end, p.strand, p.ref, p.ref_db)
        # Drop falsy parts (presumably zero-length ones; verify against
        # the location type's truthiness).
        newparts = [p for p in newparts if p]
        newparts.extend(zero_length_parts)
        if newparts:
            newfeatures.append(
                _SeqFeature(
                    # sum() adds the parts together into a single location.
                    location=sum(newparts),
                    type=feature.type,
                    id=feature.id,
                    qualifiers=feature.qualifiers,
                ))
    newfeatures.sort(key=_operator.attrgetter("location.start"))
    # Shallow copy: attributes other than features and seq are shared with self.
    answer = _copy.copy(self)
    answer.features = newfeatures
    answer.seq = newseq
    return answer
def add_feature(self, x=None, y=None, seq=None, type="misc", strand=1, *args, **kwargs):
    '''Add a feature to the feature list of the sequence.

    The feature covers ``x`` to ``y``, or, when ``seq`` is given, the first
    case-insensitive occurrence of ``seq`` in the sequence.

    Parameters
    ----------
    x : int
        Indicates start of the feature; a falsy value means 0.
    y : int
        Indicates end of the feature; a falsy value means the length
        of the sequence.
    seq : str or sequence-like object, optional
        Subsequence to locate. If given, ``x`` and ``y`` are ignored.
    type : str, optional
        Feature type, "misc" by default.
    strand : int, optional
        Strand of the feature location, 1 by default.
    **kwargs
        Used as the qualifiers of the feature. Without a ``label``, one is
        generated: ``orf<length>`` if the region or its reverse complement
        is an open reading frame, otherwise ``ft<length>``.

    Raises
    ------
    TypeError
        If ``seq`` is given but cannot be found.

    Examples
    --------
    >>> from pydna.seqrecord import SeqRecord
    >>> a=SeqRecord("atgtaa")
    >>> a.features
    []
    >>> a.add_feature(2,4)
    >>> a.features
    [SeqFeature(FeatureLocation(ExactPosition(2), ExactPosition(4), strand=1), type='misc')]
    '''
    qualifiers = dict(kwargs)

    if seq:
        # Reduce whatever was passed in to a lower-case string.
        if hasattr(seq, "seq") and hasattr(seq.seq, "watson"):
            needle = str(seq.seq.watson).lower()
        elif hasattr(seq, "seq"):
            needle = str(seq.seq).lower()
        else:
            needle = str(seq).lower()
        x = self.seq.lower().find(needle)
        if x == -1:
            raise TypeError("Could not find {}".format(needle))
        y = x + len(needle)
    else:
        x = x or 0
        y = y or len(self)

    if "label" not in qualifiers:
        region = self[x:y]
        if region.isorf() or region.reverse_complement().isorf():
            prefix = "orf"
        else:
            prefix = "ft"
        qualifiers["label"] = ["{}{}".format(prefix, y - x)]

    location = _FeatureLocation(x, y, strand=strand)
    self.features.append(_SeqFeature(location, type=type, qualifiers=qualifiers))
def __init__(
        self,
        primers,
        template,
        limit=13,
        primerc=1000.0,  # nM
        saltc=50,  # mM
        **kwargs):
    r'''The Anneal class has to be initiated with at least an iterable of
    primers and a template.

    The template is deep-copied; a ``primer_bind`` feature is added to the
    copy for every annealing position found, so the object passed in is
    not modified.

    Parameters
    ----------
    primers : iterable of :class:`Primer` or Biopython SeqRecord like
              objects
        Primer sequences 5'-3'.

    template : Dseqrecord
        The template sequence 5'-3'.

    limit : int, optional
        limit length of the annealing part of the primers.

    primerc : float, optional
        Primer concentration in nM, set to 1000.0 nM by default. Stored
        as ``self.primerc``.

    saltc : float, optional
        Salt concentration (monovalent cations) in mM, set to 50 mM by
        default. Stored as ``self.saltc``.

    **kwargs
        Stored as ``self.kwargs``, wrapped in ``defaultdict(str, ...)`` so
        that a missing key reads as an empty string.

    Attributes
    ----------
    products: list
        A list of Amplicon objects, one for each primer pair that may
        form a PCR product.

    Examples
    --------
    >>> from pydna.readers import read
    >>> from pydna.amplify import Anneal
    >>> from pydna.dseqrecord import Dseqrecord
    >>> template = Dseqrecord("tacactcaccgtctatcattatctactatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = read(">p1\ntacactcaccgtctatcattatc", ds = False)
    >>> p2 = read(">p2\ngtgctatcagatgatacagtcg", ds = False)
    >>> ann = Anneal((p1, p2), template)
    >>> print(ann.report())
    Template name 51 nt linear:
    Primer p1 anneals forward at position 23
    <BLANKLINE>
    Primer p2 anneals reverse at position 29
    >>> ann.products
    [Amplicon(51)]
    >>> amplicon_list = ann.products
    >>> amplicon = amplicon_list.pop()
    >>> amplicon
    Amplicon(51)
    >>> print(amplicon.figure())
    5tacactcaccgtctatcattatc...cgactgtatcatctgatagcac3
                               |||||||||||||||||||||| tm 55.9 (dbd) 60.5
                              3gctgacatagtagactatcgtg5
    5tacactcaccgtctatcattatc3
     ||||||||||||||||||||||| tm 54.6 (dbd) 58.8
    3atgtgagtggcagatagtaatag...gctgacatagtagactatcgtg5
    >>> amplicon.annotations['date'] = '02-FEB-2013'   # Set the date for this example to pass the doctest
    >>> print(amplicon)
    Dseqrecord
    circular: False
    size: 51
    ID: 51bp U96-TO06Y6pFs74SQx8M1IVTBiY
    Name: 51bp_PCR_prod
    Description: pcr product_p1_p2
    Number of features: 2
    /date=02-FEB-2013
    Dseq(-51)
    taca..gcac
    atgt..cgtg
    >>> print(amplicon.program())
    <BLANKLINE>
    Taq (rate 30 nt/s) 35 cycles             |51bp
    95.0°C    |95.0°C                 |      |Tm formula: Biopython Tm_NN
    |_________|_____          72.0°C  |72.0°C|SaltC 50mM
    | 03min00s|30s  \         ________|______|Primer1C 1.0µM
    |         |      \ 45.4°C/ 0min 2s| 5min |Primer2C 1.0µM
    |         |       \_____/         |      |GC 39%
    |         |         30s           |      |4-12°C
    >>>
    '''
    self.primers = primers
    self.primerc = primerc
    self.saltc = saltc
    # Work on a private copy: primer_bind features are appended below.
    self.template = _copy.deepcopy(template)
    self.limit = limit
    self.kwargs = defaultdict(str, kwargs)
    self._products = None
    self.forward_primers = []
    self.reverse_primers = []
    twl = len(self.template.seq.watson)
    tcl = len(self.template.seq.crick)
    if self.template.linear:
        tw = self.template.seq.watson
        tc = self.template.seq.crick
    else:
        # Circular template: search each strand concatenated with itself so
        # that sites spanning the origin are found as well.
        tw = self.template.seq.watson + self.template.seq.watson
        tc = self.template.seq.crick + self.template.seq.crick
    for p in self.primers:
        # Forward primers are searched against the crick strand; hits with
        # pos >= tcl (duplicates in the second copy of a doubled strand)
        # are dropped.
        self.forward_primers.extend((
            _Primer(p,
                    position=tcl - pos - min(self.template.seq.ovhg, 0),
                    footprint=fp)
            for pos, fp in _annealing_positions(str(p.seq), tc, self.limit)
            if pos < tcl))
        # Reverse primers are searched against the watson strand.
        self.reverse_primers.extend((
            _Primer(p,
                    position=pos + max(0, self.template.seq.ovhg),
                    footprint=fp)
            for pos, fp in _annealing_positions(str(p.seq), tw, self.limit)
            if pos < twl))
    self.forward_primers.sort(key=_operator.attrgetter('position'))
    self.reverse_primers.sort(key=_operator.attrgetter('position'),
                              reverse=True)
    for fp in self.forward_primers:
        if fp.position - fp._fp >= 0:
            # The footprint [position - _fp, position) lies inside the template.
            start = fp.position - fp._fp
            end = fp.position
            self.template.features.append(
                _SeqFeature(_FeatureLocation(start, end),
                            type="primer_bind",
                            strand=1,
                            qualifiers={
                                "label": [fp.name],
                                "ApEinfo_fwdcolor": ["#baffa3"],
                                "ApEinfo_revcolor": ["#ffbaba"]
                            }))
        else:
            # The footprint starts before position 0: annotate it as a
            # two-part location (tail of the template + head).
            # NOTE(review): no strand is given for this feature, unlike
            # the branch above.
            start = len(self.template) - fp._fp + fp.position
            end = start + fp._fp - len(self.template)
            sf = _SeqFeature(_CompoundLocation([
                _FeatureLocation(start, len(self.template)),
                _FeatureLocation(0, end)
            ]),
                             type="primer_bind",
                             location_operator="join",
                             qualifiers={
                                 "label": [fp.name],
                                 "ApEinfo_fwdcolor": ["#baffa3"],
                                 "ApEinfo_revcolor": ["#ffbaba"]
                             })
            self.template.features.append(sf)
    for rp in self.reverse_primers:
        if rp.position + rp._fp <= len(self.template):
            # The footprint [position, position + _fp) lies inside the template.
            start = rp.position
            end = rp.position + rp._fp
            self.template.features.append(
                _SeqFeature(_FeatureLocation(start, end),
                            type="primer_bind",
                            strand=-1,
                            qualifiers={
                                "label": [rp.name],
                                "ApEinfo_fwdcolor": ["#baffa3"],
                                "ApEinfo_revcolor": ["#ffbaba"]
                            }))
        else:
            # The footprint runs past the end of the template: two-part
            # location.
            # NOTE(review): this feature carries only a label, without
            # the ApEinfo colour qualifiers used elsewhere.
            start = rp.position
            end = rp.position + rp._fp - len(self.template)
            self.template.features.append(
                _SeqFeature(_CompoundLocation([
                    _FeatureLocation(0, end),
                    _FeatureLocation(start, len(self.template))
                ]),
                            type="primer_bind",
                            location_operator="join",
                            strand=-1,
                            qualifiers={"label": [rp.name]}))