Beispiel #1
0
    def __fill_cD(self, iL=None, snpL=None):
        """Fill *self.cD* for the current position.

        *self.cD* is purged and then filled with counts for position
        *self.pos* of *self.refSeq*.  Without SNPs, every population
        listed in *self.assM* receives *self.ploidy* counts of the
        reference base.  With SNPs, the VCF indices in *iL* are
        excluded from that default and the genotype data of the
        matching entry in *snpL* is counted allele by allele instead.
        If a SNP is an indel (reference or alternative allele longer
        than one base), all its non-missing alleles are counted as the
        reference base.

        :param [int] iL: List with vcf indices of the SNPs in *snpL*,
            must be sorted.  None, if there is no SNP.
        :param [NucBase] snpL: List with :class:`NucBase
            <cflib.vcf.NucBase>` SNPs at this position. None, if
            there is no SNP.
        :raises: :class:`NoSynBase`,
            :class:`NotAValidRefBase
            <cflib.seqbase.NotAValidRefBase>`,
            :class:`SequenceDataError
            <cflib.seqbase.SequenceDataError>`,
            :class:`ValueError`

        :class:`NoSynBase` is raised if *self.onlySynonymous* is True
        and the reference sequence reports the position as not
        synonymous.

        :class:`NotAValidRefBase <cflib.seqbase.NotAValidRefBase>` is
        raised if the (lower-cased) reference base is not a key of
        *dna*.

        :class:`SequenceDataError <cflib.seqbase.SequenceDataError>`
        is raised if the reference base of a SNP does not match the
        reference sequence, or if *iL* and *snpL* are inconsistent
        (*snpL* missing or of a different length than *iL*).

        :class:`ValueError` is raised if a population index is outside
        ``range(self.nPop)``.

        """
        if snpL is not None:
            logging.debug("Next SNP(s):")
            for s in snpL:
                logging.debug(s.get_info())

        def get_refBase():
            """Get the lower-cased reference base at *self.pos*."""
            return self.refSeq.data[self.pos].lower()

        def update_cD(pop, baseI, delta=self.ploidy):
            """Add *delta* counts of base *baseI* to population *pop*."""
            # FIXME: IUPAC code not handled here.  Is this even necessary?
            # Unknown bases are skipped before the population index is
            # validated, so they never raise.
            if baseI == dna['n'] or baseI == dna['*']:
                logging.debug("Reference base is unknown.  Continue.")
                return
            if pop in range(self.nPop):
                self.cD[pop][baseI] += delta
                logging.debug(
                    "Updating counts dictionary; population %s, "
                    "base index %s.", pop, baseI)
            else:
                # NOTE(review): the message says "Ignoring" although an
                # exception is raised; kept because callers may rely on
                # the ValueError.
                logging.info(
                    "Ignoring data because population index %s is "
                    "out of range.", pop)
                raise ValueError()

        self.purge_cD()

        # If we check for synonymous bases, do not do anything if base
        # is not 4-fold degenerate.
        if self.onlySynonymous is True:
            if self.refSeq.is_synonymous(self.pos) is False:
                logging.debug(
                    "Rejection; %s at position %s "
                    "is not a synonymous base.", self.refSeq.data[self.pos],
                    self.pos)
                raise NoSynBase()

        refBase = get_refBase()
        try:
            r = dna[refBase]
        except KeyError:
            raise sb.NotAValidRefBase()

        # If there are no SNPs, fill *self.cD* with data from reference.
        if iL is None:
            for i in range(self.nV):
                for pop in self.assM[i]:
                    update_cD(pop, r)
            return

        if snpL is None or len(iL) != len(snpL):
            raise sb.SequenceDataError("SNP information is not correct.")

        # Only fill *self.cD* from the reference for VCF indices that
        # carry no SNP here; a set keeps the membership test cheap.
        snpIndices = set(iL)
        for i in range(self.nV):
            if i not in snpIndices:
                for pop in self.assM[i]:
                    update_cD(pop, r)

        # Now traverse the SNPs together with their VCF indices.
        for vI, snp in zip(iL, snpL):
            # Check if the reference bases match.
            vcfRefBase = snp.get_ref_base().lower()
            # Thu Jun 9 09:26:55 CEST 2016: Just use first base if
            # there are more.
            indel = False
            if len(vcfRefBase) > 1:
                logging.warning("Indel at chrom %s pos %d.", self.chrom,
                                self.pos + self.offset)
                indel = True
                vcfRefBase = vcfRefBase[0]
            if dna[vcfRefBase] != r:
                print("Error at NucBase:")
                snp.print_info()
                print("The reference base at position",
                      self.pos,
                      "on chromosome",
                      self.chrom,
                      "is",
                      refBase,
                      end=".\n")
                print("The reference base of the VCF file is",
                      vcfRefBase,
                      end=".\n")
                raise sb.SequenceDataError("Reference bases do not match.")

            # An alternative allele longer than one base also marks the
            # site as an indel; one warning is logged per such allele.
            altBases = snp.get_alt_base_list()
            for altBase in altBases:
                if len(altBase) > 1:
                    indel = True
                    logging.warning("Indel at chrom %s pos %d.", self.chrom,
                                    self.pos + self.offset)

            spData = snp.get_speciesData()
            # Loop over individuals.
            for i, genotype in enumerate(spData):
                # Loop over chromatides (e.g. diploid).
                for d in range(self.ploidy):
                    allele = genotype[d]
                    if allele is None:
                        # Missing genotype data; nothing to count.
                        continue
                    if indel or allele == 0:
                        # Indels and allele 0 count as the reference base.
                        update_cD(self.assM[vI][i], r, delta=1)
                    else:
                        # Allele k > 0 refers to the (k-1)-th
                        # alternative base.
                        bI = dna[altBases[allele - 1]]
                        logging.debug("Use SNP of %s, population %s",
                                      self.indM[vI][i], self.assM[vI][i])
                        update_cD(self.assM[vI][i], bI, delta=1)
Beispiel #2
0
    def add_base_to_sequence(self,
                             pop_id,
                             base_char,
                             double_fixed_sites=False):
        """Count the base `base_char` for the population with id `pop_id`.

        The base is lower-cased and looked up in `dna`.  A base with
        an index of at most 3 adds one count to that index of
        `self.cD[pop_id]`, or two counts if `double_fixed_sites` is
        true.  Counting fixed sites twice makes sense when
        heterozygotes are encoded with IUPAC codes.

        Any other base is treated as an IUPAC code: each counter index
        the code stands for is incremented by one.  'n', '-' and '.'
        (and any other symbol known to `dna` but not listed below)
        change no counter.

        :raises: :class:`NotAValidRefBase
            <cflib.seqbase.NotAValidRefBase>` if the base is not a key
            of `dna`.

        """
        symbol = base_char.lower()
        try:
            base_id = dna[symbol]
        except KeyError:
            raise sb.NotAValidRefBase()

        # Plain base: count it once (twice for doubled fixed sites).
        if base_id <= 3:
            self.cD[pop_id][base_id] += 2 if double_fixed_sites else 1
            return

        # Honor IUPAC code: map each ambiguity symbol to the counter
        # indices it increments.  NOTE(review): indices 0..3 are taken
        # to be A, C, G, T, as the original inline comments imply --
        # confirm against the definition of `dna`.
        ambiguity_columns = {
            'r': (0, 2),     # A or G.
            'y': (1, 3),     # C or T.
            's': (1, 2),     # C or G.
            'w': (0, 3),     # A or T.
            'k': (2, 3),     # G or T.
            'm': (0, 1),     # A or C.
            'b': (1, 2, 3),  # C or G or T.
            'd': (0, 2, 3),  # A or G or T.
            'h': (0, 1, 3),  # A or C or T.
            'v': (0, 1, 2),  # A or C or G.
        }
        # Any base ('n'), gaps ('-', '.') and unlisted symbols map to
        # no index at all.
        columns = ambiguity_columns.get(symbol, ())

        if len(columns) == 3:
            logging.info("Ambivalent base with 3 possibilities.")
            logging.info("This base will be ignored upon running PoMo.")
        for column in columns:
            self.cD[pop_id][column] += 1

        logging.info("IUPAC code handled.  This might bias the analysis.")