Example #1
0
    def _extract(self, chromvariant, locset, num_type_dict):
        """Copy the variants at the given locations into a new ChromVariants.

        A fresh ``ChromVariants`` is created with the same ``chrom`` and
        ``_max_indel_len`` as ``chromvariant``.  Each location in ``locset``
        is looked up in ``chromvariant.all_variants``, the variant found is
        added to the new collection, and ``num_type_dict`` is incremented in
        place under that variant's ``var_type``.

        Returns the new collection after ``_ensure_sorted()`` has been
        called on it.
        """
        subset = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)

        # Look variants up lazily, one per location, so each lookup is
        # immediately followed by its add and its count update.
        selected = (chromvariant.all_variants[position] for position in locset)
        for variant in selected:
            subset._add_variant(variant)
            num_type_dict[variant.var_type] += 1

        subset._ensure_sorted()
        return subset
Example #2
0
    def _extract(self, chromvariant, locset, num_type_dict):
        """Build a ChromVariants restricted to the locations in ``locset``.

        The result shares ``chrom`` and ``_max_indel_len`` with
        ``chromvariant``.  As a side effect, ``num_type_dict`` gains one
        count per extracted variant, keyed by the variant's ``var_type``.
        """
        picked = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)

        for where in locset:
            # The variant is fetched from the source collection by location.
            found = chromvariant.all_variants[where]
            picked._add_variant(found)
            num_type_dict[found.var_type] += 1

        # Sort once, after every variant has been added.
        picked._ensure_sorted()
        return picked
Example #3
0
class ChromVariantStats:
    """Stats for a certain contig's worth of variants.

    Keeps the true and predicted variant collections for one chromosome,
    the true-positive / false-positive / false-negative subsets extracted
    from them, and a per-variant-type count for each subset.

    The attributes ``intersect_bad``, ``known_fp``, ``calls_at_known_fp``
    and ``known_fp_variants`` start out as ``None`` and are expected to be
    assigned by the caller after construction.
    """

    def __init__(self, true_var, pred_var, true_positives, false_positives,
                 false_negatives, concordance):
        """Split the variants into TP/FP/FN subsets and count them by type.

        Args:
            true_var: variants of the truth set; must expose ``chrom``,
                ``var_num(type)``, ``all_variants`` and ``_max_indel_len``.
            pred_var: predicted variants, same interface as ``true_var``.
            true_positives: iterable of locations looked up in ``true_var``.
            false_positives: iterable of locations looked up in ``pred_var``.
            false_negatives: iterable of locations looked up in ``true_var``.
            concordance: genotype concordance counts, indexed as
                ``concordance[var_type][genotype][genotype]``.
        """
        self.chrom = true_var.chrom
        self.true_var = true_var
        self.pred_var = pred_var
        self.num_true = {t: true_var.var_num(t) for t in VARIANT_TYPE}
        self.num_pred = {t: pred_var.var_num(t) for t in VARIANT_TYPE}
        # Per-type integer counters, filled in by _extract below.
        self.num_tp = _type_dict()
        self.num_fp = _type_dict()
        self.num_fn = _type_dict()
        # Each subset is itself a ChromVariants.
        self.false_positives = self._extract(pred_var, false_positives,
                                             self.num_fp)
        self.false_negatives = self._extract(true_var, false_negatives,
                                             self.num_fn)
        self.true_positives = self._extract(true_var, true_positives,
                                            self.num_tp)
        # Starts empty; populated with rescued variants by rectify().
        self.rescued_vars = ChromVariants(self.chrom,
                                          self.false_positives._max_indel_len)
        self.intersect_bad = None  # set externally
        self.known_fp = None  # set externally
        self.calls_at_known_fp = None  # set externally
        self.known_fp_variants = None  # set externally
        self.genotype_concordance = concordance

    def _extract(self, chromvariant, locset, num_type_dict):
        """Return a new ChromVariants holding ``chromvariant``'s variants at ``locset``.

        ``num_type_dict`` is updated in place: one count is added under the
        ``var_type`` of every variant that is copied.
        """
        clone = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)

        for loc in locset:
            var = chromvariant.all_variants[loc]
            clone._add_variant(var)
            num_type_dict[var.var_type] += 1
        clone._ensure_sorted()

        return clone

    def rectify(self, ref, window):
        """Rescue variants from VCF ambiguity.

        Given reference genome and window of sequence comparison,
        fix each error estimated to be an artifact of VCF ambiguity.

        Calling this function strictly improves realism of evaluation,
        but doesn't perfectly detect ambiguity.
        TODO: Don't do redundant calculation for overlapping windows
        TODO: This should return two bools: events match, and genotypes match

        Note: the window arrives here as a single integer size and is handed
        to ``rescue_mission`` unchanged.  Per the original author's note it is
        converted to an interval further down the stack (not visible here).
        """
        # Snapshot the locations first: rescue_mission modifies the
        # false-negative set while we are walking over it.
        locs_to_rescue = list(self.false_negatives.all_locations)
        for loc in locs_to_rescue:
            if loc not in self.false_negatives.all_variants:
                # No longer in the set of false negatives; nothing to rescue.
                continue
            new_tp, rm_fp, rescued_vars = rescue_mission(
                self.false_negatives, self.false_positives,
                self.true_positives, loc, ref, window)
            for t in VARIANT_TYPE:
                # Seemingly odd accounting. The number of predicted variants
                # *changes* as a result of rescuing.
                # e.g. 2 predicted FPs are in fact 1 FN. So
                #  -- remove 2 predicted variants
                #  -- remove 2 false positives
                #  -- remove 1 false negative
                #  -- add 1 true positive (also counted as a prediction)
                self.num_pred[t] -= rm_fp[t]
                self.num_fp[t] -= rm_fp[t]
                self.num_pred[t] += new_tp[t]
                self.num_fn[t] -= new_tp[t]
                self.num_tp[t] += new_tp[t]
            for v in rescued_vars:
                self.rescued_vars._add_variant(v)

    def _nrd_counts(self, var_type):
        """Return ``(n_wrong, n_total)`` genotype counts for ``var_type``.

        ``n_wrong`` sums every cell of the concordance table whose two
        genotypes differ.  ``n_total`` adds the HET/HET and HOM_VAR/HOM_VAR
        cells to that; the HOM_REF/HOM_REF cell is never counted.
        """
        counts = self.genotype_concordance[var_type]
        hom_ref = GENOTYPE_TYPE.HOM_REF
        het = GENOTYPE_TYPE.HET
        hom_var = GENOTYPE_TYPE.HOM_VAR
        mismatched = (
            (hom_ref, het), (hom_ref, hom_var),
            (het, hom_ref), (het, hom_var),
            (hom_var, hom_ref), (hom_var, het),
        )
        n_wrong = sum(counts[a][b] for a, b in mismatched)
        n_total = n_wrong + counts[het][het] + counts[hom_var][hom_var]
        return (n_wrong, n_total)

    def to_dict(self, var_type):
        """Return the statistics for ``var_type`` as a flat dict.

        Fields that are set externally and are still unset are reported as
        ``0`` instead of raising:

        * ``intersect_bad`` is ``0`` while ``self.intersect_bad`` is ``None``.
        * ``known_fp_calls`` is ``0`` while ``self.calls_at_known_fp`` is
          unset or empty.
        * ``known_fp`` is ``0`` unless ``self.calls_at_known_fp`` is set
          (the original gating, kept for compatibility) *and*
          ``self.known_fp`` itself is not ``None``.
        """
        nrd_wrong, nrd_total = self._nrd_counts(var_type)

        if self.intersect_bad is None:
            num_intersect_bad = 0
        else:
            num_intersect_bad = len(self.intersect_bad[var_type])

        have_calls = bool(self.calls_at_known_fp)
        known_fp_calls = self.calls_at_known_fp[var_type] if have_calls else 0
        # Previously only calls_at_known_fp was checked before subscripting
        # known_fp, which raised TypeError when known_fp was still None.
        if have_calls and self.known_fp is not None:
            known_fp = self.known_fp[var_type]
        else:
            known_fp = 0

        return {
            'num_true': self.num_true[var_type],
            'num_pred': self.num_pred[var_type],
            'false_positives': self.num_fp[var_type],
            'false_negatives': self.num_fn[var_type],
            'good_predictions': self.num_tp[var_type],
            'intersect_bad': num_intersect_bad,
            'nrd_wrong': nrd_wrong,
            'nrd_total': nrd_total,
            'known_fp_calls': known_fp_calls,
            'known_fp': known_fp,
        }
Example #4
0
class ChromVariantStats:

    """Stats for a certain contig's worth of variants.

    One instance covers a single chromosome: it stores the true and
    predicted variants, the TP / FP / FN subsets built from them, and how
    many variants of each type every subset contains.

    ``intersect_bad``, ``known_fp``, ``calls_at_known_fp`` and
    ``known_fp_variants`` are initialised to ``None`` and are set externally.
    """

    def __init__(self, true_var, pred_var, true_positives,
                 false_positives, false_negatives, concordance):
        """Build the TP/FP/FN subsets and their per-type counts.

        ``true_positives`` and ``false_negatives`` are locations resolved
        against ``true_var``; ``false_positives`` are locations resolved
        against ``pred_var``.  ``concordance`` is stored as-is and later
        indexed as ``concordance[var_type][genotype][genotype]``.
        """
        self.chrom = true_var.chrom
        self.true_var = true_var
        self.pred_var = pred_var
        self.num_true = {t: true_var.var_num(t) for t in VARIANT_TYPE}
        self.num_pred = {t: pred_var.var_num(t) for t in VARIANT_TYPE}
        self.num_tp = _type_dict()  # true positives as int
        self.num_fp = _type_dict()
        self.num_fn = _type_dict()
        # The three subsets below are ChromVariants; building them also
        # fills in the matching counter dict.
        self.false_positives = self._extract(pred_var, false_positives, self.num_fp)
        self.false_negatives = self._extract(true_var, false_negatives, self.num_fn)
        self.true_positives = self._extract(true_var, true_positives, self.num_tp)
        # Populated with rescued vars by rectify().
        self.rescued_vars = ChromVariants(self.chrom, self.false_positives._max_indel_len)
        self.intersect_bad = None  # set externally
        self.known_fp = None  # set externally
        self.calls_at_known_fp = None  # set externally
        self.known_fp_variants = None  # set externally
        self.genotype_concordance = concordance

    def _extract(self, chromvariant, locset, num_type_dict):
        """Copy the variants of ``chromvariant`` at ``locset`` into a new ChromVariants.

        Increments ``num_type_dict[var.var_type]`` for every copied variant
        and returns the new, sorted collection.
        """
        clone = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)

        for loc in locset:
            var = chromvariant.all_variants[loc]
            clone._add_variant(var)
            num_type_dict[var.var_type] += 1
        clone._ensure_sorted()

        return clone

    def rectify(self, ref, window):
        """Rescue variants from VCF ambiguity.

        Given reference genome and window of sequence comparison,
        fix each error estimated to be an artifact of VCF ambiguity.

        Calling this function strictly improves realism of evaluation,
        but doesn't perfectly detect ambiguity.
        TODO: Don't do redundant calculation for overlapping windows
        TODO: This should return two bools: events match, and genotypes match

        Note: ``window`` is a single integer size at this level and is passed
        to ``rescue_mission`` as received.  According to the original note it
        becomes an interval deeper in the call stack (not visible here).
        """
        # A copy is required: rescue_mission modifies the false-negative
        # sets, so we must not iterate over the live container.
        locs_to_rescue = list(self.false_negatives.all_locations)
        for loc in locs_to_rescue:
            still_false_negative = loc in self.false_negatives.all_variants
            if not still_false_negative:
                continue
            new_tp, rm_fp, rescued_vars = rescue_mission(
                self.false_negatives, self.false_positives, self.true_positives,
                loc, ref, window)
            for t in VARIANT_TYPE:
                # Seemingly odd accounting. The number of predicted variants
                # *changes* as a result of rescuing.
                # e.g. 2 predicted FPs are in fact 1 FN. So
                #  -- remove 2 predicted variants
                #  -- remove 2 false positives
                #  -- remove 1 false negative
                #  -- add 1 true positive (and count it as a prediction)
                self.num_pred[t] -= rm_fp[t]
                self.num_fp[t] -= rm_fp[t]
                self.num_pred[t] += new_tp[t]
                self.num_fn[t] -= new_tp[t]
                self.num_tp[t] += new_tp[t]
            for v in rescued_vars:
                self.rescued_vars._add_variant(v)

    def _nrd_counts(self, var_type):
        """Return the tuple ``(n_wrong, n_total)`` for ``var_type``.

        ``n_wrong`` is the sum of the six concordance cells in which the two
        genotypes disagree; ``n_total`` is ``n_wrong`` plus the HET/HET and
        HOM_VAR/HOM_VAR cells.  HOM_REF/HOM_REF is left out of both.
        """
        table = self.genotype_concordance[var_type]
        ref_row = table[GENOTYPE_TYPE.HOM_REF]
        het_row = table[GENOTYPE_TYPE.HET]
        var_row = table[GENOTYPE_TYPE.HOM_VAR]

        n_wrong = ref_row[GENOTYPE_TYPE.HET]
        n_wrong += ref_row[GENOTYPE_TYPE.HOM_VAR]
        n_wrong += het_row[GENOTYPE_TYPE.HOM_REF]
        n_wrong += het_row[GENOTYPE_TYPE.HOM_VAR]
        n_wrong += var_row[GENOTYPE_TYPE.HOM_REF]
        n_wrong += var_row[GENOTYPE_TYPE.HET]

        n_total = n_wrong + het_row[GENOTYPE_TYPE.HET] + var_row[GENOTYPE_TYPE.HOM_VAR]
        return (n_wrong, n_total)

    def to_dict(self, var_type):
        """Summarise the statistics for ``var_type`` in a plain dict.

        Externally-set fields that are still unset yield ``0`` rather than
        an exception: ``intersect_bad`` when ``self.intersect_bad`` is
        ``None``, ``known_fp_calls`` when ``self.calls_at_known_fp`` is unset
        or empty, and ``known_fp`` when either ``self.calls_at_known_fp`` is
        unset or empty (original gating, preserved) or ``self.known_fp`` is
        ``None``.
        """
        stats = {}
        stats['num_true'] = self.num_true[var_type]
        stats['num_pred'] = self.num_pred[var_type]
        stats['false_positives'] = self.num_fp[var_type]
        stats['false_negatives'] = self.num_fn[var_type]
        stats['good_predictions'] = self.num_tp[var_type]
        # intersect_bad is None until assigned externally; report 0 then,
        # like the other externally-set fields.
        stats['intersect_bad'] = (
            len(self.intersect_bad[var_type]) if self.intersect_bad is not None else 0)
        nrd_wrong, nrd_total = self._nrd_counts(var_type)
        stats['nrd_wrong'] = nrd_wrong
        stats['nrd_total'] = nrd_total
        stats['known_fp_calls'] = self.calls_at_known_fp[var_type] if self.calls_at_known_fp else 0
        # The old guard tested only calls_at_known_fp, so a None known_fp was
        # subscripted and raised TypeError; check known_fp itself as well.
        if self.calls_at_known_fp and self.known_fp is not None:
            stats['known_fp'] = self.known_fp[var_type]
        else:
            stats['known_fp'] = 0
        return stats