def __fill_cD(self, iL=None, snpL=None):
    """Fill *self.cD* for the current position.

    Call *self.purge_cD()* and then fill *self.cD* with data from
    the reference at chromosome *self.chrom* and position
    *self.pos*.  SNPs given in *snpL* for this position are
    considered: VCF files without a SNP here contribute the
    reference base with weight *self.ploidy* per assigned
    population, VCF files with a SNP contribute one count per
    called allele.

    If a SNP is an indel (reference or any alternative allele longer
    than one base), every called allele of that SNP is counted as
    the reference base.

    :param [int] iL: List with vcf indices of the SNPs in *snpL*,
        must be sorted.  None, if there is no SNP.
    :param [NucBase] snpL: List with :class:`NucBase
        <cflib.vcf.NucBase>` SNPs at this position.  None, if there
        is no SNP.

    :raises: :class:`NoSynBase`, :class:`NotAValidRefBase
        <cflib.seqbase.NotAValidRefBase>`, :class:`SequenceDataError
        <cflib.seqbase.SequenceDataError>`, :class:`ValueError`

    :class:`NoSynBase` is raised if *self.onlySynonymous* is True and
    the reference reports the position as not synonymous.

    :class:`NotAValidRefBase <cflib.seqbase.NotAValidRefBase>` is
    raised if the reference base is not a key of *dna*.

    :class:`SequenceDataError <cflib.seqbase.SequenceDataError>` is
    raised if the reference base of a SNP does not match the
    reference sequence, or if *iL* is given without a *snpL* of the
    same length.

    :class:`ValueError` is raised if a population index is not in
    ``range(self.nPop)``.

    """
    if snpL is not None:
        logging.debug("Next SNP(s):")
        for s in snpL:
            logging.debug(s.get_info())

    def get_refBase():
        """Get reference base on *chrom* at *pos*."""
        return self.refSeq.data[self.pos].lower()

    def update_cD(pop, baseI, delta=self.ploidy):
        """Add *delta* counts of base index *baseI* to population *pop*."""
        # FIXME: IUPAC code not handled here.  Is this even necessary?
        if baseI == dna['n'] or baseI == dna['*']:
            logging.debug("Reference base is unknown. Continue.")
            return
        if pop in range(0, self.nPop):
            self.cD[pop][baseI] += delta
            logging.debug(
                "Updating counts dictionary; population %s, "
                "base index %s.", pop, baseI)
        else:
            logging.info(
                "Ignoring data because population index %s is "
                "out of range.", pop)
            raise ValueError(
                "Population index %s is out of range." % (pop,))

    self.purge_cD()
    # If we check for synonymous bases, do not do anything if base
    # is not 4-fold degenerate.
    if self.onlySynonymous is True:
        if self.refSeq.is_synonymous(self.pos) is False:
            logging.debug(
                "Rejection; %s at position %s "
                "is not a synonymous base.",
                self.refSeq.data[self.pos], self.pos)
            raise NoSynBase()

    refBase = get_refBase()
    try:
        r = dna[refBase]
    except KeyError:
        raise sb.NotAValidRefBase()

    # If there are no SNPs, fill *self.cD* with data from reference.
    if iL is None:
        for i in range(self.nV):
            for pop in self.assM[i]:
                update_cD(pop, r)
    elif (snpL is not None) and (len(iL) == len(snpL)):
        # Else, only fill *self.cD* where the individual has no SNP.
        # A set gives constant-time membership tests inside the loop.
        snpIndices = set(iL)
        for i in range(self.nV):
            if i not in snpIndices:
                for pop in self.assM[i]:
                    update_cD(pop, r)
        # Now traverse the SNPs together with their vcf indices.
        for vI, snp in zip(iL, snpL):
            # Check if the reference bases match.
            vcfRefBase = snp.get_ref_base().lower()
            # Thu Jun 9 09:26:55 CEST 2016: Just use first base if
            # there are more.
            indel = False
            if len(vcfRefBase) > 1:
                logging.warning("Indel at chrom %s pos %d.",
                                self.chrom, self.pos + self.offset)
                indel = True
                vcfRefBase = vcfRefBase[0]
            # NOTE: a VCF reference base that is not a key of *dna*
            # propagates as KeyError from this lookup.
            if dna[vcfRefBase] != r:
                print("Error at NucBase:")
                snp.print_info()
                print("The reference base at position", self.pos,
                      "on chromosome", self.chrom,
                      "is", refBase, end=".\n")
                print("The reference base of the VCF file is",
                      vcfRefBase, end=".\n")
                raise sb.SequenceDataError("Reference bases do not match.")
            altBases = snp.get_alt_base_list()
            for altBase in altBases:
                if len(altBase) > 1:
                    indel = True
                    logging.warning("Indel at chrom %s pos %d.",
                                    self.chrom, self.pos + self.offset)
            spData = snp.get_speciesData()
            # Loop over individuals.
            for i in range(len(spData)):
                # Loop over chromatides (e.g. diploid).
                for d in range(self.ploidy):
                    allele = spData[i][d]
                    if allele is None:
                        # No call for this chromatid; nothing to count.
                        continue
                    if indel or allele == 0:
                        # Reference allele, or indel treated as reference.
                        update_cD(self.assM[vI][i], r, delta=1)
                    else:
                        # Allele numbers are 1-based into *altBases*.
                        bI = dna[altBases[allele - 1]]
                        logging.debug("Use SNP of %s, population %s",
                                      self.indM[vI][i], self.assM[vI][i])
                        update_cD(self.assM[vI][i], bI, delta=1)
    else:
        raise sb.SequenceDataError("SNP information is not correct.")
def add_base_to_sequence(self, pop_id, base_char, double_fixed_sites=False):
    """Add the base `base_char` to the counts of population `pop_id`.

    The base is lower-cased and looked up in `dna`.  A base with an
    index of at most 3 is counted once, or twice if
    `double_fixed_sites` is true; counting fixed sites twice makes
    sense when heterozygotes are encoded with IUPAC codes.

    An IUPAC ambiguity code adds one count to each base it stands
    for.  `n` and the gap characters `-` and `.` add nothing.

    :raises: `sb.NotAValidRefBase` if the base is not a key of `dna`.

    """
    # Count indices incremented for each ambiguity code, in the order
    # they are applied.  The indices follow A=0, C=1, G=2, T=3.
    ambiguity_targets = {
        'r': (0, 2),     # A or G.
        'y': (1, 3),     # C or T.
        's': (1, 2),     # G or C.
        'w': (0, 3),     # A or T.
        'k': (2, 3),     # G or T.
        'm': (0, 1),     # A or C.
        'b': (1, 2, 3),  # C or G or T.
        'd': (0, 2, 3),  # A or G or T.
        'h': (0, 1, 3),  # A or C or T.
        'v': (0, 1, 2),  # A or C or G.
    }

    symbol = base_char.lower()
    try:
        symbol_index = dna[symbol]
    except KeyError:
        raise sb.NotAValidRefBase()

    # Unambiguous base: count it directly and stop.
    if symbol_index <= 3:
        self.cD[pop_id][symbol_index] += 2 if double_fixed_sites else 1
        return

    # Honor IUPAC code.  Symbols without an entry (any base, gaps)
    # contribute no counts.
    targets = ambiguity_targets.get(symbol, ())
    if len(targets) == 3:
        logging.info("Ambivalent base with 3 possibilities.")
        logging.info("This base will be ignored upon running PoMo.")
    for target in targets:
        self.cD[pop_id][target] += 1

    logging.info("IUPAC code handled. This might bias the analysis.")
    return