Exemple #1
0
	def calculate(self, instr):
		"""Score ``instr`` against the consensus and rescale the result.

		``instr`` is passed to ``self.pssm.calculate``; when it is not already
		a ``_Seq`` it is first wrapped in one using ``_unamb_dna``. The value
		returned is the consensus score minus that score, divided by log2(e).
		"""
		target = instr if isinstance(instr, _Seq) else _Seq(instr, _unamb_dna)
		scores = self.pssm.calculate(target)
		consensus_score = self.pssm.calculate(self.consensus)
		# biopython uses log base 2, but GEMSTAT uses log base e, hence the
		# division by log2(e).
		# TODO: Make it automatically determine what the base of biopython log
		# is by creating a special pwm.
		log2_of_e = _S.log2(_S.e)
		return (consensus_score - scores) / log2_of_e
Exemple #2
0
    def __init__(
        self, record, *args, amplicon=None, position=None, footprint=0, **kwargs
    ):
        """Build the instance from a record-like object, a sequence or raw data.

        Parameters
        ----------
        record :
            * an object with a ``features`` attribute: every entry of its
              ``__dict__`` is copied onto ``self`` and the parent initialiser
              is not called;
            * an object with a ``transcribe`` attribute: handed to the parent
              initialiser unchanged;
            * anything else: wrapped in ``_Seq`` before being handed to the
              parent initialiser.
        *args, **kwargs :
            Forwarded to the parent initialiser (unused in the first case).
        amplicon :
            Accepted but not used by this method.
        position :
            Stored as ``self.position``.
        footprint : int
            Stored as ``self._fp``; a falsy value means ``len(record)`` is
            stored instead.
        """
        if hasattr(record, "features"):
            # Adopt the state of an already-built record attribute by attribute.
            for attr_name, attr_value in record.__dict__.items():
                setattr(self, attr_name, attr_value)
        else:
            if hasattr(record, "transcribe"):
                sequence = record
            else:
                sequence = _Seq(record)
            super().__init__(sequence, *args, **kwargs)

        self.position = position
        self._fp = footprint if footprint else len(record)
Exemple #3
0
 def __init__(self, record,
              *args,
              template=None,
              position=None,
              footprint=0,
              concentration=1000.0,   # nM (= 1 µM)
              **kwargs):
     """Build the instance from a record-like object, a sequence or raw data.

     ``record`` is handled in one of three ways:

     * it has a ``features`` attribute: every entry of its ``__dict__`` is
       copied onto ``self`` and the parent initialiser is not called;
     * it has an ``alphabet`` attribute: it is handed to the parent
       initialiser unchanged;
     * otherwise it is wrapped in ``_Seq`` with ``_IUPACAmbiguousDNA()``
       before being handed to the parent initialiser.

     ``*args`` and ``**kwargs`` are forwarded to the parent initialiser.
     ``concentration``, ``position`` and ``template`` are stored under the
     same names; ``footprint`` is stored as ``self._fp``.
     """
     if hasattr(record, "features"):
         # Adopt the state of an already-built record attribute by attribute.
         for attr_name, attr_value in record.__dict__.items():
             setattr(self, attr_name, attr_value)
     else:
         if hasattr(record, "alphabet"):
             sequence = record
         else:
             sequence = _Seq(record, _IUPACAmbiguousDNA())
         super().__init__(sequence, *args, **kwargs)

     self.concentration = concentration
     self.position = position
     self._fp = footprint
     self.template = template
Exemple #4
0
    def __init__(self, *args, **kwargs):
        """Initialise via the parent class, then normalise the metadata.

        After the parent initialiser has run, this method:

        * truncates ``self.name`` to 16 characters, emitting a
          ``_PydnaWarning`` when it does so;
        * replaces the ``"<unknown ...>"`` placeholder values of ``name``,
          ``id`` and ``description`` with ``"name"``, ``"id"`` and
          ``"description"``;
        * sets ``self.map_target`` to ``None``;
        * wraps ``self.seq`` in ``_Seq`` (with ``_IUPACAmbiguousDNA()``) when
          it has no ``alphabet`` attribute, and strips all whitespace from the
          sequence data;
        * converts ``id``, ``name``, ``description`` and every key and value of
          ``annotations`` with ``_pretty_str``.
        """
        super().__init__(*args, **kwargs)

        full_name = self.name
        if len(full_name) > 16:
            clipped_name = full_name[:16]
            message = "name property {} truncated to 16 chars {}".format(
                full_name, clipped_name)
            # stacklevel=2 attributes the warning to whoever called __init__.
            _warn(message, _PydnaWarning, stacklevel=2)
            self.name = clipped_name

        # (attribute, placeholder value, replacement)
        placeholder_table = (
            ("name", "<unknown name>", "name"),
            ("id", "<unknown id>", "id"),
            ("description", "<unknown description>", "description"),
        )
        for attribute, placeholder, replacement in placeholder_table:
            if getattr(self, attribute) == placeholder:
                setattr(self, attribute, replacement)

        #if not 'date' in self.annotations:
        #    self.annotations.update({"date": _datetime.date.today().strftime("%d-%b-%Y").upper()})

        self.map_target = None

        if not hasattr(self.seq, "alphabet"):
            self.seq = _Seq(self.seq, _IUPACAmbiguousDNA())

        # remove whitespaces
        pieces = self.seq._data.split()
        self.seq._data = "".join(pieces)

        for attribute in ("id", "name", "description"):
            setattr(self, attribute, _pretty_str(getattr(self, attribute)))

        pretty_annotations = {}
        for key, value in self.annotations.items():
            pretty_key = _pretty_str(key)
            pretty_annotations[pretty_key] = _pretty_str(value)
        self.annotations = pretty_annotations
Exemple #5
0
    def __init__(self, *args, **kwargs):
        """Initialise via the parent class, then normalise the metadata.

        After the parent initialiser has run, this method:

        * records ``"molecule_type": "DNA"`` in ``self.annotations``;
        * truncates ``self.name`` to 16 characters, emitting a
          ``_PydnaWarning`` when it does so;
        * replaces the ``"<unknown ...>"`` placeholder values of ``name``,
          ``id`` and ``description`` with ``"name"``, ``"id"`` and
          ``"description"``;
        * sets ``self.map_target`` to ``None``;
        * wraps ``self.seq`` in ``_Seq`` when it has no ``transcribe``
          attribute, and strips all whitespace from the sequence data;
        * converts ``id``, ``name``, ``description`` and every key and value of
          ``annotations`` with ``_pretty_str``.
        """
        super().__init__(*args, **kwargs)

        self.annotations.update({"molecule_type": "DNA"})

        full_name = self.name
        if len(full_name) > 16:
            clipped_name = full_name[:16]
            message = "name property {} truncated to 16 chars {}".format(
                full_name, clipped_name)
            # stacklevel=2 attributes the warning to whoever called __init__.
            _warn(message, _PydnaWarning, stacklevel=2)
            self.name = clipped_name

        # (attribute, placeholder value, replacement)
        placeholder_table = (
            ("name", "<unknown name>", "name"),
            ("id", "<unknown id>", "id"),
            ("description", "<unknown description>", "description"),
        )
        for attribute, placeholder, replacement in placeholder_table:
            if getattr(self, attribute) == placeholder:
                setattr(self, attribute, replacement)

        self.map_target = None

        if not hasattr(self.seq, "transcribe"):
            self.seq = _Seq(self.seq)

        # remove whitespaces
        pieces = self.seq._data.split()
        self.seq._data = "".join(pieces)
        # self.seq.alphabet = _generic_dna

        for attribute in ("id", "name", "description"):
            setattr(self, attribute, _pretty_str(getattr(self, attribute)))

        pretty_annotations = {}
        for key, value in self.annotations.items():
            pretty_key = _pretty_str(key)
            pretty_annotations[pretty_key] = _pretty_str(value)
        self.annotations = pretty_annotations
Exemple #6
0
def pcr(*args, **kwargs):
    """pcr is a convenience function for the Anneal class to simplify its
    usage, especially from the command line. If more than one or no PCR
    product is formed, a ValueError is raised.

    args is any iterable of Dseqrecords or an iterable of iterables of
    Dseqrecords. args will be greedily flattened.

    Parameters
    ----------

    args : iterable containing sequence objects
        Several arguments are also accepted.

    limit : int = 13, optional
        limit length of the annealing part of the primers.
        Passed on to Anneal together with any other keyword argument.

    Notes
    -----

    sequences in args could be of type:

    * string
    * Seq
    * SeqRecord (or subclass)
    * Dseqrecord (or subclass)

    The last sequence will be assumed to be the template while
    all preceding sequences will be assumed to be primers.

    A single Amplicon argument is expanded to its forward primer, its
    reverse primer and the Amplicon itself as template.

    This is a powerful function, use with care!

    Returns
    -------

    product : Amplicon
        An :class:`pydna.amplicon.Amplicon` object representing the PCR
        product. The direction of the PCR product will be the same as for
        the template sequence.

    Raises
    ------

    TypeError
        If an argument is not one of the accepted sequence types.

    ValueError
        If no sequences are given, or if the number of PCR products formed
        is not exactly one.

    Examples
    --------

    >>> from pydna.dseqrecord import Dseqrecord
    >>> from pydna.readers import read
    >>> from pydna.amplify import pcr
    >>> from pydna.primer import Primer
    >>> template = Dseqrecord("tacactcaccgtctatcattatctac\
tatcgactgtatcatctgatagcac")
    >>> from Bio.SeqRecord import SeqRecord
    >>> p1 = Primer("tacactcaccgtctatcattatc")
    >>> p2 = Primer("cgactgtatcatctgatagcac").reverse_complement()
    >>> pcr(p1, p2, template)
    Amplicon(51)
    >>> pcr([p1, p2], template)
    Amplicon(51)
    >>> pcr((p1,p2,), template)
    Amplicon(51)
    >>>

    """

    # Normalise every argument to a SeqRecord-like object.
    sequences = []
    for item in _flatten(args):  # flatten
        if hasattr(item, "watson"):
            item = _SeqRecord(_Seq(item.watson))
        elif hasattr(item, "transcribe"):
            item = _SeqRecord(item)
        elif isinstance(item, str):
            item = _SeqRecord(_Seq(item))
        elif hasattr(item, "features"):
            pass
        else:
            raise TypeError("arguments need to be a string, Bio.Seq, SeqRecord"
                            ", Primer, Dseqrecord or Amplicon object")
        sequences.append(item)

    # Without this guard an empty call fails below with a bare IndexError.
    if not sequences:
        raise ValueError("No sequences given! pcr needs at least a template.")

    # A single Amplicon object
    if len(sequences) == 1 and hasattr(sequences[0], "forward_primer"):
        amplicon = sequences[0]
        sequences = [amplicon.forward_primer, amplicon.reverse_primer, amplicon]

    # The template is taken from the list itself rather than from the loop
    # variable left over from the normalisation loop above.
    template = sequences[-1]
    if not hasattr(template.seq, "watson"):
        template = _Dseqrecord(template)

    anneal_primers = Anneal(sequences[:-1], template, **kwargs)

    if len(anneal_primers.products) == 1:
        return anneal_primers.products[0]
    elif len(anneal_primers.products) == 0:
        raise ValueError("No PCR product! {}".format(anneal_primers.report()))
    raise ValueError("PCR not specific! {}".format(anneal_primers.report()))
Exemple #7
0
    def cut(self, *enzymes):
        """Returns a tuple of linear Dseq fragments produced in the digestion.
        If there are no cuts, an empty tuple is returned.

        Parameters
        ----------

        enzymes : enzyme object or iterable of such objects
            A Bio.Restriction.XXX restriction objects or iterable.
            A single argument with an ``intersection`` attribute is treated
            as a RestrictionBatch.

        Returns
        -------
        frags : tuple
            tuple of Dseq objects formed by the digestion


        Examples
        --------

        >>> from pydna.dseq import Dseq
        >>> seq=Dseq("ggatccnnngaattc")
        >>> seq
        Dseq(-15)
        ggatccnnngaattc
        cctaggnnncttaag
        >>> from Bio.Restriction import BamHI,EcoRI
        >>> type(seq.cut(BamHI))
        <class 'tuple'>
        >>> for frag in seq.cut(BamHI): print(repr(frag))
        Dseq(-5)
        g
        cctag
        Dseq(-14)
        gatccnnngaattc
            gnnncttaag
        >>> seq.cut(EcoRI, BamHI) ==  seq.cut(BamHI, EcoRI)
        True
        >>> a,b,c = seq.cut(EcoRI, BamHI)
        >>> a+b+c
        Dseq(-15)
        ggatccnnngaattc
        cctaggnnncttaag
        >>>

        """

        # Padding placed on both sides of a circular sequence before it is
        # searched; len(pad) is subtracted from the hit positions further down.
        pad = "n" * 50

        # Sequence used only to decide which enzymes cut at all.
        # NOTE(review): mung() is defined elsewhere; presumably it removes
        # single stranded overhangs - confirm.
        if self.linear:
            dsseq = self.mung()
        else:
            dsseq = Dseq.from_string(self._data, linear=True, circular=False)

        if len(enzymes) == 1 and hasattr(enzymes[0],
                                         "intersection"):  # RestrictionBatch
            enzymecuts = []
            for e in enzymes[0]:
                # cuts = e.search(dsseq+dsseq[:e.size-1] if self.circular else dsseq)
                # For a circular molecule the first e.size - 1 bases are
                # repeated at the end so that a site spanning the origin is
                # part of the searched string.
                cuts = e.search(
                    _Seq(pad + dsseq.watson + dsseq.watson[:e.size - 1] +
                         pad) if self.circular else dsseq)
                enzymecuts.append((cuts, e))
            # Order the enzymes by their cut positions and drop those that
            # do not cut.
            enzymecuts.sort()
            enzymes = [e for (c, e) in enzymecuts if c]
        else:
            # Flatten the arguments, remove duplicates while keeping the
            # order (dict.fromkeys) and keep only enzymes that cut.
            enzymes = [
                e for e in list(dict.fromkeys(_flatten(enzymes))) if e.search(
                    _Seq(pad + dsseq.watson + dsseq.watson[:e.size - 1] +
                         pad) if self.circular else dsseq)
            ]  # flatten

        # No enzyme cuts: nothing to digest.
        if not enzymes:
            return ()

        if self.linear:
            frags = [self]
        else:
            # Circular molecule: open it at the first site where a watson
            # strand hit and a crick strand hit describe the same cut, giving
            # a single linear fragment that the loop below digests further.
            l = len(self)
            for e in enzymes:
                # Hit positions with the padding offset (and one more)
                # removed, in reversed order.
                wpos = [
                    x - len(pad) - 1 for x in e.search(
                        _Seq(pad + self.watson + self.watson[:e.size - 1]) +
                        pad)
                ][::-1]
                cpos = [
                    x - len(pad) - 1 for x in e.search(
                        _Seq(pad + self.crick + self.crick[:e.size - 1]) + pad)
                ][::-1]

                for w, c in _itertools.product(wpos, cpos):
                    if w % len(self) == (self.length - c + e.ovhg) % len(self):
                        # Rotate both strands so that they start at the cut.
                        frags = [
                            Dseq(
                                self.watson[w % l:] + self.watson[:w % l],
                                self.crick[c % l:] + self.crick[:c % l],
                                ovhg=e.ovhg,
                                pos=w,
                            )
                        ]
                        break
                else:
                    # No matching (w, c) pair for this enzyme: try the next.
                    continue
                # A matching pair was found: stop looking at further enzymes.
                break
            # NOTE(review): if no enzyme yields a matching (w, c) pair, frags
            # is never assigned and the loop below raises NameError - confirm
            # whether that can happen.

        newfrags = []

        # Digest with one enzyme at a time; the fragments produced by one
        # enzyme are the input for the next.
        for enz in enzymes:
            for frag in frags:

                # Hit positions on each strand, shifted down by one.
                # NOTE(review): an "N" is appended before searching,
                # presumably so that a site at the very end is reported -
                # confirm.
                ws = [x - 1 for x in enz.search(_Seq(frag.watson) + "N")]
                cs = [x - 1 for x in enz.search(_Seq(frag.crick) + "N")]

                # Keep only watson/crick position pairs that satisfy the
                # overhang-corrected equality below, i.e. that belong to the
                # same site.
                sitepairs = [(sw, sc)
                             for sw, sc in _itertools.product(ws, cs[::-1])
                             if (sw + max(0, frag.ovhg) -
                                 max(0, enz.ovhg) == len(frag.crick) - sc -
                                 min(0, frag.ovhg) + min(0, enz.ovhg))]

                # Sentinel pair marking the end of the fragment.
                sitepairs.append((self.length, 0))

                w2, c1 = sitepairs[0]

                # Piece from the start of the fragment up to the first pair;
                # it keeps the overhang and position of the parent fragment.
                newfrags.append(
                    Dseq(frag.watson[:w2],
                         frag.crick[c1:],
                         ovhg=frag.ovhg,
                         pos=frag.pos))

                # Pieces between consecutive pairs; their overhang is the
                # one left by the enzyme.
                for (w1, c2), (w2, c1) in zip(sitepairs[:-1], sitepairs[1:]):
                    newfrags.append(
                        Dseq(
                            frag.watson[w1:w2],
                            frag.crick[c1:c2],
                            ovhg=enz.ovhg,
                            pos=frag.pos + w1 - max(0, enz.ovhg),
                        ))

            frags = newfrags
            newfrags = []

        return tuple(frags)
Exemple #8
0
    def align(self, insert_size = False, 
                    path_to_exe = False, 
                    local_alns_path = ['alignments'], 
                    force = False, 
                    max_cpus = -1):
        '''
        Align each pair of read files in self.read_files to the genome using "bwa mem".

        The genome sequence is written to genome_sequences/<genome_id>.fna, BWA
        index files are written for it if any are missing, and one SAM file per
        read pair is written to <local_alns_path>/<genome_id>/. The SAM paths
        are stored by pair name in self.aligned_read_files.

        insert_size: if set, passed to "bwa mem" with -I (converted to a
            string, so numbers are accepted); otherwise -I is omitted.
        path_to_exe: path to the bwa executable; if not set it is looked up
            with _get_exe_path('bwa').
        local_alns_path: list of path components, joined with the path
            separator, naming the folder for alignments. The default list is
            never modified here.
        force: if True, existing SAM files are overwritten; otherwise they are
            left alone and only reported.
        max_cpus: passed to _decide_max_processes() to select the number of
            threads given to "bwa mem" with -t.

        Raises AssertionError if self.read_files is missing or if a read file
        does not exist.

        NOTE(review): the exit status of the bwa calls is not checked.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            # best effort: any failure here (e.g. folder already present) is
            # ignored and will surface when the fasta file is written
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()), id = self.genome_id), 
                    genome_fna, 
                    'fasta')

        # make folder for alignments (SAM files are written there)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join([
                                local_alns_path, 
                                self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes( max_cpus )

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        # only index if one or more of the BWA index files is missing
        have_index_files = [_os.path.exists(genome_fna + '.' + a) for a in ('ann','pac','amb','bwt','sa')]

        if not all(have_index_files):
            print('Writing BWA index files for %s' % genome_fna)
            _subprocess.call([path_to_exe, 'index', genome_fna])

        aligned_read_files = {}
        for pairname, files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)

            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every item of the command must be a string, both for
                # subprocess and for the ' '.join() used when printing
                cmd += ['-I', str(insert_size)]
            # without -I, BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([local_alns_path_genome, '%s__%s.sam' % (pairname, self.genome_id)])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                with open(out_sam, "wb") as out:
                    _subprocess.call(cmd, stdout = out)

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Exemple #9
0
    def align(self,
              insert_size=False,
              path_to_exe=False,
              local_alns_path=['alignments'],
              force=False,
              max_cpus=-1):
        '''
        Align each pair of read files in self.read_files to the genome using "bwa mem".

        The genome sequence is written to genome_sequences/<genome_id>.fna, BWA
        index files are (re)written for it on every call, and one SAM file per
        read pair is written to <local_alns_path>/<genome_id>/. The SAM paths
        are stored by pair name in self.aligned_read_files.

        insert_size: if set, passed to "bwa mem" with -I (converted to a
            string, so numbers are accepted); otherwise -I is omitted.
        path_to_exe: path to the bwa executable; if not set it is looked up
            with _get_exe_path('bwa').
        local_alns_path: list of path components, joined with the path
            separator, naming the folder for alignments. The default list is
            never modified here.
        force: if True, existing SAM files are overwritten; otherwise they are
            left alone and only reported.
        max_cpus: passed to _decide_max_processes() to select the number of
            threads given to "bwa mem" with -t.

        Raises AssertionError if self.read_files is missing or if a read file
        does not exist.

        NOTE(review): the exit status of the bwa calls is not checked.
        '''

        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            # best effort: any failure here (e.g. folder already present) is
            # ignored and will surface when the fasta file is written
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id

        _SeqIO.write(
            _SeqRecord(_Seq(self.genome_sequence.tostring()),
                       id=self.genome_id), genome_fna, 'fasta')

        # make folder for alignments (SAM files are written there)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)

        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join(
            [local_alns_path, self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes(max_cpus)

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'

        assert hasattr(self, 'read_files'), e1

        e2 = 'Could not find %s. Either run trim() again or ensure file exists'

        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        # always (re)index in case of upstream changes in data
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])

        aligned_read_files = {}
        for pairname, files in self.read_files.items():
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)

            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a']
            if insert_size:
                # every item of the command must be a string, both for
                # subprocess and for the ' '.join() used when printing
                cmd += ['-I', str(insert_size)]
            # without -I, BWA can estimate on-the-fly
            cmd += ['-R', RGinfo, genome_fna, files[1], files[2]]

            out_sam = _os.path.sep.join([
                local_alns_path_genome,
                '%s__%s.sam' % (pairname, self.genome_id)
            ])

            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                with open(out_sam, "wb") as out:
                    _subprocess.call(cmd, stdout=out)

            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')

            print(' '.join(cmd))

            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
Exemple #10
0
    def generateSequences(self):
        '''
        Create full length sequences with generated variants applied to the reference sequence.

        Generated variants are pickled to 'baga.GemSIM_known_variants.p' in the
        current working directory.

        If large deletions are present, genomes from index self.num_individuals
        onwards are built from a copy of the reference with those ranges
        removed, and their variant positions are corrected accordingly before
        being applied. Both the unadjusted and adjusted positions are saved.

        The resulting records are stored, one per genome, in self.genotypes.

        Raises AssertionError if a SNP does not change the reference character
        or if the applied SNPs do not match the requested positions.
        '''

        save_these = {}
        save_these['SNPs'] = self.SNPs_per_genome
        save_these['InDels'] = self.indel_dict_by_pos_pergenome

        if len(self.large_deletions):
            # generate a version of reference genome with large deletions
            ranges = sorted(self.large_deletions.values())
            genome_large_deletions = self.genome.sequence[:ranges[0][0]]
            # keep each stretch between the end of one deletion and the
            # start of the next
            for (_, e), (next_s, _) in zip(ranges[:-1], ranges[1:]):
                genome_large_deletions.extend(self.genome.sequence[e:next_s])

            # keep everything after the last deletion; indexing the last
            # range directly also works when there is only one deletion
            # (a loop counter would be unbound in that case)
            genome_large_deletions.extend(self.genome.sequence[ranges[-1][1]:])

            # adjust generated variant positions for genome with deletions
            def adjust(pos0):
                offset = 0
                for s,e in ranges:
                    if pos0 > e:
                        offset += (e-s)
                return(pos0 - offset)

            # adjust the second half of the generated variants
            SNPs_per_genome_adjusted = self.SNPs_per_genome[:self.num_individuals]
            for SNPs in self.SNPs_per_genome[self.num_individuals:]:
                adjusted = [(adjust(pos0), variant) for pos0, variant in SNPs]
                SNPs_per_genome_adjusted += [adjusted]

            indel_dict_by_pos_pergenome_adjusted = self.indel_dict_by_pos_pergenome[:self.num_individuals]
            for indels in self.indel_dict_by_pos_pergenome[self.num_individuals:]:
                adjusted = {}
                for pos0,indel in indels.items():
                    adjusted[adjust(pos0)] = indel

                indel_dict_by_pos_pergenome_adjusted += [adjusted]

            save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
            save_these['InDels_adjusted'] = indel_dict_by_pos_pergenome_adjusted
            SNPs_per_genome = SNPs_per_genome_adjusted
            indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
        else:
            SNPs_per_genome = self.SNPs_per_genome
            indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

        # adjusted are needed to apply the variants
        # unadjusted are needed to check the calling
        # (the with-block makes sure the pickle is flushed and closed)
        with open('baga.GemSIM_known_variants.p','w') as known_variants_file:
            _cPickle.dump(save_these, known_variants_file)
        # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','r'))

        ### generate genotypes (apply variants) ###
        genotypes = []
        for gn,SNPs in enumerate(SNPs_per_genome):
            if len(self.large_deletions) and gn >= self.num_individuals:
                # use genome with large deletions for second batch
                orig_genome = genome_large_deletions
                genome = _array('c',genome_large_deletions)
            else:
                # else use original
                orig_genome = self.genome.sequence
                genome = _array('c',self.genome.sequence)

            # first SNPs
            for pos0,SNP in SNPs:
                assert genome[pos0] != SNP
                genome[pos0] = SNP

            # check it worked
            changed = [pos0 for pos0,(new,old) in enumerate(zip(genome,list(orig_genome))) if new != old]
            assert changed == [pos0 for pos0,var in SNPs], 'SNPs were not correctly applied . . .'

            # then indels: copy the genome piece by piece, in position order
            newgenome = _array('c')
            last_pos0 = 0
            for pos0,indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
                if isinstance(indel,str):
                    # insertion: the string is added at pos0
                    newgenome.extend(genome[last_pos0:pos0])
                    newgenome.extend(indel)
                    last_pos0 = pos0
                else:
                    # deletion: skip this many characters starting at pos0
                    newgenome.extend(genome[last_pos0:pos0])
                    last_pos0 = pos0 + indel

            newgenome.extend(genome[last_pos0:])
            genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()), 
                    id = self.genome.id+'_sim{:02d}'.format(gn+1), name = '', description = '')
            genotypes += [genome_seqrecord]
            print(len(self.genome.sequence),len(genotypes[-1]),genotypes[-1].id)

        self.genotypes = genotypes
Exemple #11
0
    def generateSequences(self):
        '''
        Create full length sequences with generated variants applied to the reference sequence.

        Generated variants are pickled to 'baga.GemSIM_known_variants.p' in the
        current working directory.

        If large deletions are present, genomes from index self.num_individuals
        onwards are built from a copy of the reference with those ranges
        removed, and their variant positions are corrected accordingly before
        being applied. Both the unadjusted and adjusted positions are saved.

        The resulting records are stored, one per genome, in self.genotypes.

        Raises AssertionError if a SNP does not change the reference character
        or if the applied SNPs do not match the requested positions.
        '''

        save_these = {}
        save_these['SNPs'] = self.SNPs_per_genome
        save_these['InDels'] = self.indel_dict_by_pos_pergenome

        if len(self.large_deletions):
            # generate a version of reference genome with large deletions
            ranges = sorted(self.large_deletions.values())
            genome_large_deletions = self.genome.sequence[:ranges[0][0]]
            # keep each stretch between the end of one deletion and the
            # start of the next
            for (_, e), (next_s, _) in zip(ranges[:-1], ranges[1:]):
                genome_large_deletions.extend(self.genome.sequence[e:next_s])

            # keep everything after the last deletion; indexing the last
            # range directly also works when there is only one deletion
            # (a loop counter would be unbound in that case)
            genome_large_deletions.extend(
                self.genome.sequence[ranges[-1][1]:])

            # adjust generated variant positions for genome with deletions
            def adjust(pos0):
                offset = 0
                for s, e in ranges:
                    if pos0 > e:
                        offset += (e - s)
                return (pos0 - offset)

            num_individuals = self.num_individuals

            # adjust the second half of the generated variants
            SNPs_per_genome_adjusted = self.SNPs_per_genome[:num_individuals]
            for SNPs in self.SNPs_per_genome[num_individuals:]:
                adjusted = [(adjust(pos0), variant) for pos0, variant in SNPs]
                SNPs_per_genome_adjusted += [adjusted]

            indel_dict_by_pos_pergenome_adjusted = \
                self.indel_dict_by_pos_pergenome[:num_individuals]
            for indels in self.indel_dict_by_pos_pergenome[num_individuals:]:
                adjusted = {}
                for pos0, indel in indels.items():
                    adjusted[adjust(pos0)] = indel

                indel_dict_by_pos_pergenome_adjusted += [adjusted]

            save_these['SNPs_adjusted'] = SNPs_per_genome_adjusted
            save_these['InDels_adjusted'] = \
                indel_dict_by_pos_pergenome_adjusted
            SNPs_per_genome = SNPs_per_genome_adjusted
            indel_dict_by_pos_pergenome = indel_dict_by_pos_pergenome_adjusted
        else:
            SNPs_per_genome = self.SNPs_per_genome
            indel_dict_by_pos_pergenome = self.indel_dict_by_pos_pergenome

        # adjusted are needed to apply the variants
        # unadjusted are needed to check the calling
        # (the with-block makes sure the pickle is flushed and closed)
        with open('baga.GemSIM_known_variants.p', 'w') as known_variants_file:
            _cPickle.dump(save_these, known_variants_file)
        # save_these = cPickle.load(open('baga.GemSIM_known_variants.p','r'))

        ### generate genotypes (apply variants) ###
        genotypes = []
        for gn, SNPs in enumerate(SNPs_per_genome):
            if len(self.large_deletions) and gn >= self.num_individuals:
                # use genome with large deletions for second batch
                orig_genome = genome_large_deletions
                genome = _array('c', genome_large_deletions)
            else:
                # else use original
                orig_genome = self.genome.sequence
                genome = _array('c', self.genome.sequence)

            # first SNPs
            for pos0, SNP in SNPs:
                assert genome[pos0] != SNP
                genome[pos0] = SNP

            # check it worked
            changed = [
                pos0
                for pos0, (new,
                           old) in enumerate(zip(genome, list(orig_genome)))
                if new != old
            ]
            assert changed == [pos0 for pos0, var in SNPs
                               ], 'SNPs were not correctly applied . . .'

            # then indels: copy the genome piece by piece, in position order
            newgenome = _array('c')
            last_pos0 = 0
            for pos0, indel in sorted(indel_dict_by_pos_pergenome[gn].items()):
                if isinstance(indel, str):
                    # insertion: the string is added at pos0
                    newgenome.extend(genome[last_pos0:pos0])
                    newgenome.extend(indel)
                    last_pos0 = pos0
                else:
                    # deletion: skip this many characters starting at pos0
                    newgenome.extend(genome[last_pos0:pos0])
                    last_pos0 = pos0 + indel

            newgenome.extend(genome[last_pos0:])
            genome_seqrecord = _SeqRecord(_Seq(newgenome.tostring()),
                                          id=self.genome.id +
                                          '_sim{:02d}'.format(gn + 1),
                                          name='',
                                          description='')
            genotypes += [genome_seqrecord]
            print(len(self.genome.sequence), len(genotypes[-1]),
                  genotypes[-1].id)

        self.genotypes = genotypes