def calculate(self, instr):
    if isinstance(instr, _Seq):
        vals = self.pssm.calculate(instr)
    else:
        vals = self.pssm.calculate(_Seq(instr, _unamb_dna))
    energy = self.pssm.calculate(self.consensus) - vals
    # biopython uses log base 2, but GEMSTAT uses log base e
    # TODO: Make it automatically determine what the base of the biopython
    # log is by creating a special pwm.
    return energy / _S.log2(_S.e)
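# The division above converts bits to nats: a log2-based score x satisfies
# x / log2(e) == x * ln(2). A minimal numeric check, with numpy standing in
# for the _S alias used above (an assumption; _S may equally be scipy):
import numpy as np

score_bits = 3.5                          # a PSSM score in log base 2 (bits)
score_nats = score_bits / np.log2(np.e)   # same conversion as in calculate()
assert abs(score_nats - score_bits * np.log(2)) < 1e-12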
def __init__(self, record, *args,
             amplicon=None,  # accepted but not stored here
             position=None, footprint=0, **kwargs):
    if hasattr(record, "features"):
        # SeqRecord (or subclass): adopt all of its attributes
        for key, value in record.__dict__.items():
            setattr(self, key, value)
    elif hasattr(record, "transcribe"):
        # Seq object
        super().__init__(record, *args, **kwargs)
    else:
        # plain string
        super().__init__(_Seq(record), *args, **kwargs)
    self.position = position
    self._fp = footprint or len(record)
def __init__(self, record, *args,
             template=None, position=None, footprint=0,
             concentration=1000.0,  # nM (= 1 µM)
             **kwargs):
    if hasattr(record, "features"):
        # SeqRecord (or subclass): adopt all of its attributes
        for key, value in record.__dict__.items():
            setattr(self, key, value)
    elif hasattr(record, "alphabet"):
        # Seq object (pre-1.78 Biopython, carrying an alphabet)
        super().__init__(record, *args, **kwargs)
    else:
        # plain string
        super().__init__(_Seq(record, _IUPACAmbiguousDNA()), *args, **kwargs)
    self.concentration = concentration
    self.position = position
    self._fp = footprint
    self.template = template
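# A minimal usage sketch for the two constructors above, assuming they belong
# to a SeqRecord subclass such as pydna's Primer (an assumption based on the
# signatures; the class statement is not shown in this excerpt). A plain
# string, a Seq, or another record all work:
from pydna.primer import Primer

p1 = Primer("tacactcaccgtctatcattatc")   # plain string: wrapped in a Seq
p2 = Primer(p1, position=17)             # record-like: attributes are copied
print(len(p1), p2.position, p2._fp)      # 23 17 23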
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    if len(self.name) > 16:
        short_name = self.name[:16]
        _warn("name property {} truncated to 16 chars {}".format(
                  self.name, short_name),
              _PydnaWarning, stacklevel=2)
        self.name = short_name
    if self.name == "<unknown name>":
        self.name = "name"
    if self.id == "<unknown id>":
        self.id = "id"
    if self.description == "<unknown description>":
        self.description = "description"
    # if not 'date' in self.annotations:
    #     self.annotations.update({"date": _datetime.date.today().strftime("%d-%b-%Y").upper()})
    self.map_target = None
    if not hasattr(self.seq, "alphabet"):
        self.seq = _Seq(self.seq, _IUPACAmbiguousDNA())
    self.seq._data = "".join(self.seq._data.split())  # remove whitespace
    self.id = _pretty_str(self.id)
    self.name = _pretty_str(self.name)
    self.description = _pretty_str(self.description)
    self.annotations = {
        _pretty_str(k): _pretty_str(v) for k, v in self.annotations.items()
    }
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.annotations.update({"molecule_type": "DNA"})
    if len(self.name) > 16:
        short_name = self.name[:16]
        _warn(
            "name property {} truncated to 16 chars {}".format(
                self.name, short_name),
            _PydnaWarning,
            stacklevel=2,
        )
        self.name = short_name
    if self.name == "<unknown name>":
        self.name = "name"
    if self.id == "<unknown id>":
        self.id = "id"
    if self.description == "<unknown description>":
        self.description = "description"
    self.map_target = None
    if not hasattr(self.seq, "transcribe"):
        self.seq = _Seq(self.seq)
    self.seq._data = "".join(self.seq._data.split())  # remove whitespace
    # self.seq.alphabet = _generic_dna
    self.id = _pretty_str(self.id)
    self.name = _pretty_str(self.name)
    self.description = _pretty_str(self.description)
    self.annotations = {
        _pretty_str(k): _pretty_str(v) for k, v in self.annotations.items()
    }
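# A minimal sketch of the 16-character name cap enforced above (the limit
# matches the width of the classic GenBank LOCUS name field). This assumes
# the __init__ belongs to pydna's Dseqrecord and that keyword arguments are
# forwarded to SeqRecord (an assumption; the class statement is not shown):
from pydna.dseqrecord import Dseqrecord

rec = Dseqrecord("atgc", name="a_very_long_sequence_name")
print(rec.name)   # "a_very_long_sequ", emitted together with a PydnaWarning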
def pcr(*args, **kwargs):
    """pcr is a convenience function for the Anneal class to simplify
    its usage, especially from the command line. If more than one or no
    PCR product is formed, a ValueError is raised.

    args is any iterable of Dseqrecords or an iterable of iterables of
    Dseqrecords. args will be greedily flattened.

    Parameters
    ----------
    args : iterable containing sequence objects
        Several arguments are also accepted.
    limit : int = 13, optional
        limit length of the annealing part of the primers.

    Notes
    -----
    sequences in args could be of type:

    * string
    * Seq
    * SeqRecord (or subclass)
    * Dseqrecord (or subclass)

    The last sequence will be assumed to be the template while all
    preceding sequences will be assumed to be primers.

    This is a powerful function, use with care!

    Returns
    -------
    product : Amplicon
        An :class:`pydna.amplicon.Amplicon` object representing the PCR
        product. The direction of the PCR product will be the same as for
        the template sequence.

    Examples
    --------
    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.readers import read
    >>> from pydna.amplify import pcr
    >>> from pydna.primer import Primer
    >>> template = Dseqrecord("tacactcaccgtctatcattatctac\
tatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = Primer("tacactcaccgtctatcattatc")
    >>> p2 = Primer("cgactgtatcatctgatagcac").reverse_complement()
    >>> pcr(p1, p2, template)
    Amplicon(51)
    >>> pcr([p1, p2], template)
    Amplicon(51)
    >>> pcr((p1, p2,), template)
    Amplicon(51)
    >>>
    """
    output = _flatten(args)  # flatten
    new = []
    for s in output:
        if hasattr(s, "watson"):
            s = _SeqRecord(_Seq(s.watson))
        elif hasattr(s, "transcribe"):
            s = _SeqRecord(s)
        elif isinstance(s, str):
            s = _SeqRecord(_Seq(s))
        elif hasattr(s, "features"):
            pass
        else:
            raise TypeError("arguments need to be a string, Bio.Seq, SeqRecord"
                            ", Primer, Dseqrecord or Amplicon object")
        new.append(s)
    # A single Amplicon object
    if len(new) == 1 and hasattr(new[0], "forward_primer"):
        new = [new[0].forward_primer, new[0].reverse_primer, new[0]]
    if not hasattr(new[-1].seq, "watson"):
        # was _Dseqrecord(s): a stale loop variable that only worked by accident
        new[-1] = _Dseqrecord(new[-1])
    anneal_primers = Anneal(new[:-1], new[-1], **kwargs)
    if len(anneal_primers.products) == 1:
        return anneal_primers.products[0]
    elif len(anneal_primers.products) == 0:
        raise ValueError("No PCR product! {}".format(anneal_primers.report()))
    raise ValueError("PCR not specific! {}".format(anneal_primers.report()))
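# pcr() raises ValueError both when no product forms and when several do, so
# callers screening primer pairs may want to trap it. A short sketch reusing
# the doctest's own sequences:
from pydna.dseqrecord import Dseqrecord
from pydna.amplify import pcr
from pydna.primer import Primer

template = Dseqrecord("tacactcaccgtctatcattatctactatcgactgtatcatctgatagcac")
fwd = Primer("tacactcaccgtctatcattatc")
rev = Primer("cgactgtatcatctgatagcac").reverse_complement()
try:
    product = pcr(fwd, rev, template)
except ValueError as err:    # "No PCR product!" or "PCR not specific!"
    print(err)
else:
    print(len(product))      # 51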
def cut(self, *enzymes):
    """Returns a tuple of linear Dseq fragments produced in the digestion.

    If there are no cuts, an empty tuple is returned.

    Parameters
    ----------
    enzymes : enzyme object or iterable of such objects
        A Bio.Restriction.XXX restriction object or an iterable of them.

    Returns
    -------
    frags : tuple
        tuple of Dseq objects formed by the digestion

    Examples
    --------
    >>> from pydna.dseq import Dseq
    >>> seq = Dseq("ggatccnnngaattc")
    >>> seq
    Dseq(-15)
    ggatccnnngaattc
    cctaggnnncttaag
    >>> from Bio.Restriction import BamHI, EcoRI
    >>> type(seq.cut(BamHI))
    <class 'tuple'>
    >>> for frag in seq.cut(BamHI): print(repr(frag))
    Dseq(-5)
    g
    cctag
    Dseq(-14)
    gatccnnngaattc
        gnnncttaag
    >>> seq.cut(EcoRI, BamHI) == seq.cut(BamHI, EcoRI)
    True
    >>> a, b, c = seq.cut(EcoRI, BamHI)
    >>> a + b + c
    Dseq(-15)
    ggatccnnngaattc
    cctaggnnncttaag
    >>>
    """
    pad = "n" * 50

    if self.linear:
        dsseq = self.mung()
    else:
        dsseq = Dseq.from_string(self._data, linear=True, circular=False)

    if len(enzymes) == 1 and hasattr(enzymes[0], "intersection"):  # RestrictionBatch
        enzymecuts = []
        for e in enzymes[0]:
            # cuts = e.search(dsseq+dsseq[:e.size-1] if self.circular else dsseq)
            cuts = e.search(
                _Seq(pad + dsseq.watson + dsseq.watson[:e.size - 1] + pad)
                if self.circular else dsseq)
            enzymecuts.append((cuts, e))
        enzymecuts.sort()
        enzymes = [e for (c, e) in enzymecuts if c]
    else:
        # flatten and deduplicate, keeping only enzymes that actually cut
        enzymes = [
            e for e in list(dict.fromkeys(_flatten(enzymes)))
            if e.search(
                _Seq(pad + dsseq.watson + dsseq.watson[:e.size - 1] + pad)
                if self.circular else dsseq)
        ]

    if not enzymes:
        return ()

    if self.linear:
        frags = [self]
    else:
        # rotate the circular molecule so that it starts at the first cut
        length = len(self)
        for e in enzymes:
            wpos = [x - len(pad) - 1
                    for x in e.search(
                        _Seq(pad + self.watson + self.watson[:e.size - 1]) + pad)][::-1]
            cpos = [x - len(pad) - 1
                    for x in e.search(
                        _Seq(pad + self.crick + self.crick[:e.size - 1]) + pad)][::-1]
            for w, c in _itertools.product(wpos, cpos):
                if w % len(self) == (self.length - c + e.ovhg) % len(self):
                    frags = [
                        Dseq(
                            self.watson[w % length:] + self.watson[:w % length],
                            self.crick[c % length:] + self.crick[:c % length],
                            ovhg=e.ovhg,
                            pos=w,
                        )
                    ]
                    break
            else:
                continue
            break

    newfrags = []
    for enz in enzymes:
        for frag in frags:
            ws = [x - 1 for x in enz.search(_Seq(frag.watson) + "N")]
            cs = [x - 1 for x in enz.search(_Seq(frag.crick) + "N")]
            # pair watson cuts with the crick cuts that belong to the same site
            sitepairs = [
                (sw, sc) for sw, sc in _itertools.product(ws, cs[::-1])
                if (sw + max(0, frag.ovhg) - max(0, enz.ovhg)
                    == len(frag.crick) - sc - min(0, frag.ovhg) + min(0, enz.ovhg))
            ]
            sitepairs.append((self.length, 0))
            w2, c1 = sitepairs[0]
            newfrags.append(
                Dseq(frag.watson[:w2], frag.crick[c1:],
                     ovhg=frag.ovhg, pos=frag.pos))
            for (w1, c2), (w2, c1) in zip(sitepairs[:-1], sitepairs[1:]):
                newfrags.append(
                    Dseq(
                        frag.watson[w1:w2],
                        frag.crick[c1:c2],
                        ovhg=enz.ovhg,
                        pos=frag.pos + w1 - max(0, enz.ovhg),
                    ))
        frags = newfrags
        newfrags = []

    return tuple(frags)
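# The circular branch above rotates the molecule to the first cut before
# re-digesting, so one site in a circle yields a single linear fragment. A
# small hedged sketch (the sequence is arbitrary):
from Bio.Restriction import BamHI
from pydna.dseq import Dseq

circ = Dseq("ggatccnnnnnnnnnn", circular=True)
frags = circ.cut(BamHI)
print(len(frags))       # 1: one cut opens the circle
print(repr(frags[0]))   # linear fragment with BamHI's 4-base 5' overhangs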
def align(self, insert_size=False, path_to_exe=False,
          local_alns_path=['alignments'], force=False, max_cpus=-1):
    if not path_to_exe:
        path_to_exe = _get_exe_path('bwa')

    # write genome sequence to a fasta file
    try:
        _os.makedirs('genome_sequences')
    except OSError:
        pass
    genome_fna = 'genome_sequences/%s.fna' % self.genome_id
    _SeqIO.write(
        _SeqRecord(_Seq(self.genome_sequence.tostring()), id=self.genome_id),
        genome_fna, 'fasta')

    # make folder for alignments (BAMs)
    local_alns_path = _os.path.sep.join(local_alns_path)
    if not _os.path.exists(local_alns_path):
        _os.makedirs(local_alns_path)

    # make a subdir for this genome
    local_alns_path_genome = _os.path.sep.join(
        [local_alns_path, self.genome_id])
    if not _os.path.exists(local_alns_path_genome):
        _os.makedirs(local_alns_path_genome)

    max_processes = _decide_max_processes(max_cpus)

    e1 = ('Could not find "read_files" attribute. Before aligning to '
          'genome, reads must be quality score trimmed. Please run trim() '
          'method on this Reads instance.')
    assert hasattr(self, 'read_files'), e1
    e2 = 'Could not find %s. Either run trim() again or ensure file exists'
    for pairname, files in self.read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        assert _os.path.exists(files[2]), e2 % files[2]

    # only build the BWA index if any of its files are missing
    have_index_files = [_os.path.exists(genome_fna + '.' + a)
                        for a in ('ann', 'pac', 'amb', 'bwt', 'sa')]
    if not all(have_index_files):
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])

    aligned_read_files = {}
    for pairname, files in self.read_files.items():
        RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
        if insert_size:
            # str() because subprocess arguments must be strings
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-I', str(insert_size), '-R', RGinfo,
                   genome_fna, files[1], files[2]]
        else:
            # BWA can estimate insert size on-the-fly
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-R', RGinfo, genome_fna, files[1], files[2]]
        out_sam = _os.path.sep.join([
            local_alns_path_genome,
            '%s__%s.sam' % (pairname, self.genome_id)])
        if not _os.path.exists(out_sam) or force:
            print('Called: "%s"' % ' '.join(cmd))
            with open(out_sam, "wb") as out:
                _subprocess.call(cmd, stdout=out)
        else:
            print('Found:')
            print(out_sam)
            print('use "force = True" to overwrite')
            print(' '.join(cmd))
        aligned_read_files[pairname] = out_sam

    self.aligned_read_files = aligned_read_files
def align(self, insert_size=False, path_to_exe=False,
          local_alns_path=['alignments'], force=False, max_cpus=-1):
    if not path_to_exe:
        path_to_exe = _get_exe_path('bwa')

    # write genome sequence to a fasta file
    try:
        _os.makedirs('genome_sequences')
    except OSError:
        pass
    genome_fna = 'genome_sequences/%s.fna' % self.genome_id
    _SeqIO.write(
        _SeqRecord(_Seq(self.genome_sequence.tostring()), id=self.genome_id),
        genome_fna, 'fasta')

    # make folder for alignments (BAMs)
    local_alns_path = _os.path.sep.join(local_alns_path)
    if not _os.path.exists(local_alns_path):
        _os.makedirs(local_alns_path)

    # make a subdir for this genome
    local_alns_path_genome = _os.path.sep.join(
        [local_alns_path, self.genome_id])
    if not _os.path.exists(local_alns_path_genome):
        _os.makedirs(local_alns_path_genome)

    max_processes = _decide_max_processes(max_cpus)

    e1 = ('Could not find "read_files" attribute. Before aligning to '
          'genome, reads must be quality score trimmed. Please run trim() '
          'method on this Reads instance.')
    assert hasattr(self, 'read_files'), e1
    e2 = 'Could not find %s. Either run trim() again or ensure file exists'
    for pairname, files in self.read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        assert _os.path.exists(files[2]), e2 % files[2]

    # always (re)index in case of upstream changes in data
    print('Writing BWA index files for %s' % genome_fna)
    _subprocess.call([path_to_exe, 'index', genome_fna])

    aligned_read_files = {}
    for pairname, files in self.read_files.items():
        RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
        if insert_size:
            # str() because subprocess arguments must be strings
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-I', str(insert_size), '-R', RGinfo,
                   genome_fna, files[1], files[2]]
        else:
            # BWA can estimate insert size on-the-fly
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-R', RGinfo, genome_fna, files[1], files[2]]
        out_sam = _os.path.sep.join([
            local_alns_path_genome,
            '%s__%s.sam' % (pairname, self.genome_id)])
        if not _os.path.exists(out_sam) or force:
            print('Called: "%s"' % ' '.join(cmd))
            with open(out_sam, "wb") as out:
                _subprocess.call(cmd, stdout=out)
        else:
            print('Found:')
            print(out_sam)
            print('use "force = True" to overwrite')
            print(' '.join(cmd))
        aligned_read_files[pairname] = out_sam

    self.aligned_read_files = aligned_read_files
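# A minimal sketch of the read-group string and bwa mem command assembled in
# both align() variants above (pair name, genome path and fastq names are
# illustrative placeholders):
pairname = 'sample1'
genome_fna = 'genome_sequences/NC_000000.fna'
RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
cmd = ['bwa', 'mem', '-t', '4', '-M', '-a', '-R', RGinfo,
       genome_fna, 'sample1_1.fastq', 'sample1_2.fastq']
print(' '.join(cmd))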
def generateSequences(self):
    '''
    Create full length sequences with generated variants applied to the
    reference sequence. Generated variants are saved to a pickle file.
    If large deletions are present, variant positions are corrected
    appropriately when applied.
    '''
    save_these = {}
    save_these['SNPs'] = self.SNPs_per_genome
    save_these['InDels'] = self.indel_dict_by_pos_pergenome
    if len(self.large_deletions):
        # generate a version of reference genome with large deletions
        ranges = sorted(self.large_deletions.values())
        genome_large_deletions = self.genome.sequence[:ranges[0][0]]
        for n, (s, e) in enumerate(ranges[:-1]):
            genome_large_deletions.extend(
                self.genome.sequence[e:ranges[n + 1][0]])
        # tail after the final deletion (ranges[-1][1], not ranges[n+1][1],
        # so a single deletion also works)
        genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

        # adjust generated variant positions for genome with deletions
        def adjust(pos0):
            offset = 0
            for s, e in ranges:
                if pos0 > e:
                    offset += (e - s)
            return pos0 - offset

        # adjust the second half of the generated variants
        SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
        for SNPs in self.SNPs_per_genome[self.num_individuals:]:
            adjusted = []
            for pos0, variant in SNPs:
                adjusted += [(adjust(pos0), variant)]
            SNPs_per_genome_adjusted += [adjusted]
        indel_dict_by_pos_pergenome_adjusted = \
                self.indel_dict_by_pos_pergenome[:self.num_individuals]
        for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
            adjusted = {}
            for pos0, indel in indels.items():
                adjusted[adjust(pos0)] = indel
            indel_dict_by_pos_pergenome_adjusted += [adjusted]
        save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
        save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
        SNPs_per_genome = SNPs_per_genome_adjusted
        indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
    else:
        SNPs_per_genome = self.SNPs_per_genome
        indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

    # adjusted are needed to apply the variants
    # unadjusted are needed to check the calling
    _cPickle.dump(save_these, open('baga.GemSIM_known_variants.p', 'w'))
    # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','r'))

    ### generate genotypes (apply variants) ###
    genotypes = []
    for gn, SNPs in enumerate(SNPs_per_genome):
        if len(self.large_deletions) and gn >= self.num_individuals:
            # use genome with large deletions for second batch
            orig_genome = genome_large_deletions
            genome = _array('c', genome_large_deletions)
        else:
            # else use original
            orig_genome = self.genome.sequence
            genome = _array('c', self.genome.sequence)
        # first SNPs
        for pos0, SNP in SNPs:
            assert genome[pos0] != SNP
            genome[pos0] = SNP
        # check it worked
        changed = [pos0 for pos0, (new, old) in
                   enumerate(zip(genome, list(orig_genome))) if new != old]
        assert changed == [pos0 for pos0, var in SNPs], \
                'SNPs were not correctly applied . . .'
        # then indels
        newgenome = _array('c')
        last_pos0 = 0
        for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
            if isinstance(indel, str):
                # insertion
                newgenome.extend(genome[last_pos0:pos0])
                newgenome.extend(indel)
                last_pos0 = pos0
            else:
                # deletion
                newgenome.extend(genome[last_pos0:pos0])
                last_pos0 = pos0 + indel
        newgenome.extend(genome[last_pos0:])
        genome_seqrecord = _SeqRecord(
            _Seq(newgenome.tostring()),
            id=self.genome.id + '_sim{:02d}'.format(gn + 1),
            name='', description='')
        genotypes += [genome_seqrecord]
        print(len(self.genome.sequence), len(genotypes[-1]), genotypes[-1].id)

    self.genotypes = genotypes
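# A worked check of the position adjustment used above: with one large
# deletion spanning [100, 150), a reference position past the deletion shifts
# left by the 50 removed bases (numbers are illustrative):
ranges = [(100, 150)]

def adjust(pos0):
    offset = 0
    for s, e in ranges:
        if pos0 > e:
            offset += (e - s)
    return pos0 - offset

assert adjust(200) == 150   # shifted left by the 50 deleted bases
assert adjust(50) == 50     # positions before the deletion are unchanged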