Exemple #1
0
    def Variants(self, chromosome, start, end):
        """
        Return the variants found in the source VCF files for a region.

        Every file in self.vcfFiles is queried for chromosome:start-end.
        Lines rejected by isValidVcfLine are skipped, and each
        comma-separated ALT allele is handled on its own:

        - alleles whose length differs from REF by more than
          self.options.maxSize are skipped (logged at debug level);
        - SNPs (REF and ALT both of length 1) are kept unchanged;
        - MNPs (REF and ALT of equal length) have their matching leading
          and trailing bases trimmed, shifting the position past the
          trimmed leading bases;
        - anything else is kept untrimmed when self.options.longHaps == 1;
          otherwise the first base is dropped and any further matching
          leading bases are trimmed, shifting the position accordingly.

        A file whose fetch raises is reported with a warning and skipped.

        Note that this is not a generator: it returns a list of Variant
        objects, de-duplicated via set() and sorted using Variant's own
        ordering (presumably genomic co-ordinate -- confirm against the
        Variant class).
        """

        def trimSharedPrefix(refSeq, altSeq, position):
            # Drop the bases both alleles share at their start and advance
            # the position by one for every base dropped.
            shared = 0
            limit = min(len(refSeq), len(altSeq))
            while shared < limit and refSeq[shared] == altSeq[shared]:
                shared += 1
            return refSeq[shared:], altSeq[shared:], position + shared

        def trimSharedSuffix(refSeq, altSeq):
            # Drop the bases both alleles share at their end; the position
            # is unaffected by trailing trims.
            shared = 0
            limit = min(len(refSeq), len(altSeq))
            while shared < limit and refSeq[-1 - shared] == altSeq[-1 - shared]:
                shared += 1
            return (refSeq[:len(refSeq) - shared],
                    altSeq[:len(altSeq) - shared])

        varList = []
        maxSize = self.options.maxSize

        for vcfFile in self.vcfFiles:
            try:
                vcfLines = vcfFile.fetch(chromosome,
                                         start,
                                         end,
                                         parser=ctabix.asVCF())
            except Exception as e:
                # Best effort: one unreadable source file must not stop the
                # remaining files from being processed.
                logger.warning(
                    "Could not retrieve variants from source file in region %s:%s-%s. Error was %s",
                    chromosome, start, end, e)
                continue

            for line in vcfLines:

                if not isValidVcfLine(line):
                    continue

                # Get the components of the VCF line
                pos = line.pos
                ref = line.ref
                lenRef = len(ref)

                for alt in line.alt.split(","):
                    lenAlt = len(alt)
                    varSize = abs(lenAlt - lenRef)

                    if varSize > maxSize:
                        logger.debug(
                            "Skipping large variant of size %s in source file. Maximum allowed variant size is %s",
                            varSize, maxSize)
                        continue

                    # SNP
                    if lenRef == 1 and lenAlt == 1:
                        varList.append(
                            Variant(chromosome, pos, ref, alt, 0, FILE_VAR))

                    # MNP
                    elif lenRef == lenAlt:
                        # MNPs may need leading and/or trailing bases trimmed
                        removed, added, varPos = trimSharedPrefix(
                            ref, alt, pos)
                        removed, added = trimSharedSuffix(removed, added)
                        varList.append(
                            Variant(chromosome, varPos, removed, added, 0,
                                    FILE_VAR))

                    # Anything else
                    else:
                        if self.options.longHaps == 1:
                            # Long-haplotype mode keeps the alleles exactly
                            # as they appear in the file.
                            varList.append(
                                Variant(chromosome, pos, ref, alt, 0,
                                        FILE_VAR))
                            continue

                        # VCF4 is -1 indexed for indels, so trim off first base.
                        # Then trim the matching bits off and shift position.
                        # This will decompose multi-variant sites into
                        # individual alleles at different positions.
                        removed, added, varPos = trimSharedPrefix(
                            ref[1:], alt[1:], pos)

                        # NOTE: alleles where both removed and added are still
                        # non-empty after trimming are deliberately not
                        # filtered out here; they are passed on as they are.
                        varList.append(
                            Variant(chromosome, varPos, removed, added, 0,
                                    FILE_VAR))

        # Several files (or several records) can describe the same variant;
        # collapse duplicates before sorting.
        varList = sorted(set(varList))
        logger.debug("Found %s variants in region %s in source file",
                     len(varList), "%s:%s-%s" % (chromosome, start, end))
        return varList
Exemple #2
0
    def Variants(self, chromosome, start, end):
        """
        Generator funtion. Yields variants in order of
        genomic co-ordinate.
        """
        vcfLines = None
        varList = []
        maxSize = self.options.maxSize

        for vcfFile in self.vcfFiles:
            try:
                vcfLines = vcfFile.fetch(chromosome, start, end, parser=ctabix.asVCF())
            except Exception, e:
                logger.warning("Could not retrieve variants from source file in region %s:%s-%s. Error was %s" %(chromosome,start,end,e))
                continue

            for line in vcfLines:
                
                if not isValidVcfLine(line):
                    continue
                
                # Get the components of the VCF line
                chrom = line.contig
                pos = line.pos
                ref = line.ref
                altCol = line.alt
                alts = altCol.split(",")

                lenRef = len(ref)

                for alt in alts:
                    lenAlt = len(alt)
                    varSize = abs(lenAlt - lenRef)

                    if varSize > maxSize:
                        logger.debug("Skipping large variant of size %s in source file. Maximum allowed variant size is %s" %(varSize, maxSize))
                        continue

                    # SNP
                    if lenRef == 1 and lenAlt == 1:
                        var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR)
                        varList.append(var)

                    # MNP
                    elif lenRef == lenAlt:
                        # MNPs may leading and/or trailing bases trimming
                        #var = Variant(chromosome, pos, ref, alt, 0, FILE_VAR)
                        #varList.append(var)

                        tempRef = ref
                        tempAlt = alt
                        tempPos = pos
                        removed = tempRef
                        added = tempAlt

                        # Trim leading bases
                        while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[0] == tempAlt[0]:
                            tempRef = tempRef[1:]
                            tempAlt = tempAlt[1:]
                            removed = tempRef
                            added = tempAlt
                            tempPos +=1

                        # Trim trailing bases
                        while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[-1] == tempAlt[-1]:
                            tempRef = tempRef[:-1]
                            tempAlt = tempAlt[:-1]
                            removed = tempRef
                            added = tempAlt

                        var = Variant(chromosome, tempPos, removed, added, 0, FILE_VAR)
                        varList.append(var)

                    # Anything else
                    else:

                        # VCF4 is -1 indexed for indels, so trim off first base
                        tempRef = ref[1:]
                        tempAlt = alt[1:]
                        tempPos = pos
                        removed = tempRef
                        added = tempAlt

                        # Trim the matching bits off and shift position. This will decompose
                        # multi-variant sites into individual alleles at different positions.
                        while len(tempRef) > 0 and len(tempAlt) > 0 and tempRef[0] == tempAlt[0]:
                            tempRef = tempRef[1:]
                            tempAlt = tempAlt[1:]
                            removed = tempRef
                            added = tempAlt
                            tempPos +=1

                        # Skip weird cases for now
                        #if len(removed) != 0 and len(added) != 0:
                        #    continue
                            #logger.error("Dodgy variant found at %s:%s, with ref=%s, alt = %s" %(chrom,pos,ref,alt))
                            #logger.error("This will probably break something later on...")

                        var = Variant(chromosome, tempPos, removed, added, 0, FILE_VAR)
                        varList.append(var)