def Variants(self, chromosome, start, end):
    """
    Collect the variants listed in the source VCF files for one region.

    Every file in self.vcfFiles is queried for chromosome:start-end, and
    each ALT allele of each valid record is turned into one Variant:

      * SNP (ref and alt both one base): used as-is.
      * MNP (ref and alt the same length): bases shared at the start and
        at the end of the two alleles are trimmed off, and the position is
        advanced by the number of leading bases removed.
      * Anything else (lengths differ): if self.options.longHaps == 1 the
        record is used untrimmed; otherwise the first base of both alleles
        is dropped and any further shared leading bases are trimmed,
        advancing the position by one per base trimmed.

    Alleles whose length differs from the reference by more than
    self.options.maxSize are skipped. A file whose fetch raises is logged
    and skipped; the remaining files are still read.

    Despite the historical description of this method as a generator, it
    returns a list: the de-duplicated variants, sorted using Variant's own
    ordering (presumably genomic co-ordinate -- defined outside this
    method).
    """

    def trimLeading(refSeq, altSeq, pos):
        # Drop the bases shared at the start of both alleles, moving the
        # position forward by one for every base dropped.
        shared = 0
        limit = min(len(refSeq), len(altSeq))

        while shared < limit and refSeq[shared] == altSeq[shared]:
            shared += 1

        return refSeq[shared:], altSeq[shared:], pos + shared

    def trimTrailing(refSeq, altSeq):
        # Drop the bases shared at the end of both alleles. The position is
        # unaffected. Explicit end indices are used because seq[:-0] would
        # be empty rather than the whole sequence.
        shared = 0
        limit = min(len(refSeq), len(altSeq))

        while shared < limit and refSeq[-1 - shared] == altSeq[-1 - shared]:
            shared += 1

        return refSeq[:len(refSeq) - shared], altSeq[:len(altSeq) - shared]

    varList = []
    maxSize = self.options.maxSize

    for vcfFile in self.vcfFiles:
        try:
            vcfLines = vcfFile.fetch(chromosome, start, end, parser=ctabix.asVCF())
        except Exception as e:
            # Deliberately best-effort: one unreadable source file must not
            # prevent the others from being used.
            logger.warning(
                "Could not retrieve variants from source file in region %s:%s-%s. Error was %s",
                chromosome, start, end, e)
            continue

        for line in vcfLines:
            if not isValidVcfLine(line):
                continue

            # Get the components of the VCF line
            pos = line.pos
            ref = line.ref
            lenRef = len(ref)

            # One record may list several comma-separated ALT alleles.
            for alt in line.alt.split(","):
                lenAlt = len(alt)
                varSize = abs(lenAlt - lenRef)

                if varSize > maxSize:
                    logger.debug(
                        "Skipping large variant of size %s in source file. Maximum allowed variant size is %s",
                        varSize, maxSize)
                    continue

                # SNP
                if lenRef == 1 and lenAlt == 1:
                    varList.append(Variant(chromosome, pos, ref, alt, 0, FILE_VAR))

                # MNP: may need leading and/or trailing bases trimming
                elif lenRef == lenAlt:
                    removed, added, varPos = trimLeading(ref, alt, pos)
                    removed, added = trimTrailing(removed, added)
                    varList.append(Variant(chromosome, varPos, removed, added, 0, FILE_VAR))

                # Anything else
                else:
                    if self.options.longHaps == 1:
                        # Long-haplotype mode keeps the record exactly as
                        # written in the file.
                        varList.append(Variant(chromosome, pos, ref, alt, 0, FILE_VAR))
                        continue

                    # VCF4 is -1 indexed for indels, so trim off first base
                    # (the position itself is left unchanged by this step).
                    # Then trim the matching bits off and shift position.
                    # This will decompose multi-variant sites into
                    # individual alleles at different positions.
                    removed, added, varPos = trimLeading(ref[1:], alt[1:], pos)

                    # NOTE: cases where both 'removed' and 'added' are still
                    # non-empty after trimming are passed through unchanged.
                    varList.append(Variant(chromosome, varPos, removed, added, 0, FILE_VAR))

    # De-duplicate (the same variant can come from several files or records)
    # and sort; both rely on Variant's hashing/comparison.
    varList = sorted(set(varList))
    logger.debug(
        "Found %s variants in region %s in source file",
        len(varList), "%s:%s-%s" % (chromosome, start, end))
    return varList
def Variants(self, chromosome, start, end): """ Generator funtion. Yields variants in order of genomic co-ordinate. """ vcfLines = None varList = [] maxSize = self.options.maxSize for vcfFile in self.vcfFiles: try: vcfLines = vcfFile.fetch(chromosome, start, end, parser=ctabix.asVCF()) except Exception, e: logger.warning("Could not retrieve variants from source file in region %s:%s-%s. Error was %s" %(chromosome,start,end,e)) continue for line in vcfLines: if not isValidVcfLine(line): continue # Get the components of the VCF line chrom = line.contig pos = line.pos ref = line.ref altCol = line.alt alts = altCol.split(",") lenRef = len(ref) for alt in alts: lenAlt = len(alt) varSize = abs(lenAlt - lenRef) if varSize > maxSize: logger.debug("Skipping large variant of size %s in source file. Maximum allowed variant size is %s" %(varSize, maxSize)) continue # SNP if lenRef == 1 and lenAlt == 1: var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR) varList.append(var) # MNP elif lenRef == lenAlt: # MNPs may leading and/or trailing bases trimming #var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR) #varList.append(var) tempRef = ref tempAlt = alt tempPos = pos removed = tempRef added = tempAlt # Trim leading bases while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[0] == tempAlt[0]: tempRef = tempRef[1:] tempAlt = tempAlt[1:] removed = tempRef added = tempAlt tempPos +=1 # Trim trailing bases while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[-1] == tempAlt[-1]: tempRef = tempRef[:-1] tempAlt = tempAlt[:-1] removed = tempRef added = tempAlt var = Variant(chromosome, tempPos, removed, added, 0, FILE_VAR) varList.append(var) # Anything else else: # VCF4 is -1 indexed for indels, so trim off first base tempRef = ref[1:] tempAlt = alt[1:] tempPos = pos removed = tempRef added = tempAlt # Trim the matching bits off and shift position. This will decompose # multi-variant sites into individual alleles at different positions. 
while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[0] == tempAlt[0]: tempRef = tempRef[1:] tempAlt = tempAlt[1:] removed = tempRef added = tempAlt tempPos +=1 # Skip weird cases for now #if len(removed) != 0 and len(added) != 0: # continue #logger.error("Dodgy variant found at %s:%s, with ref=%s, alt = %s" %(chrom,pos,ref,alt)) #logger.error("This will probably break something later on...") var = Variant(chromosome, tempPos, removed, added, 0, FILE_VAR) varList.append(var)