def __init__(self, true_var, pred_var, true_positives, false_positives,
             false_negatives, concordance):
    """Set up per-type counters and variant subsets for one contig.

    Args:
        true_var: variants container for the truth set; provides ``chrom``,
            ``var_num(type)`` and ``all_variants``.
        pred_var: variants container for the predicted set (same API).
        true_positives: keys into ``true_var.all_variants``.
        false_positives: keys into ``pred_var.all_variants``.
        false_negatives: keys into ``true_var.all_variants``.
        concordance: stored unchanged as ``genotype_concordance``.
    """
    self.chrom = true_var.chrom
    self.true_var = true_var
    self.pred_var = pred_var

    # Totals per variant type, as reported by each container.
    self.num_true = {vtype: true_var.var_num(vtype) for vtype in VARIANT_TYPE}
    self.num_pred = {vtype: pred_var.var_num(vtype) for vtype in VARIANT_TYPE}

    # Integer tallies per variant type; filled in by _extract below.
    self.num_tp = _type_dict()
    self.num_fp = _type_dict()
    self.num_fn = _type_dict()

    # Each subset is its own ChromVariants holding just the selected variants.
    self.false_positives = self._extract(
        pred_var, false_positives, self.num_fp)
    self.false_negatives = self._extract(
        true_var, false_negatives, self.num_fn)
    self.true_positives = self._extract(
        true_var, true_positives, self.num_tp)

    # Starts empty; meant to be populated with rescued variants later.
    self.rescued_vars = ChromVariants(
        self.chrom, self.false_positives._max_indel_len)

    # The following attributes are assigned by external code after
    # construction.
    self.intersect_bad = None
    self.known_fp = None
    self.calls_at_known_fp = None
    self.known_fp_variants = None

    self.genotype_concordance = concordance
def _extract(self, chromvariant, locset, num_type_dict):
    """Copy the variants at the given locations into a new ChromVariants.

    Args:
        chromvariant: source container; read via ``chrom``,
            ``_max_indel_len`` and ``all_variants``.
        locset: iterable of keys into ``chromvariant.all_variants``.
        num_type_dict: per-variant-type counter; incremented in place once
            for every variant copied.

    Returns:
        A new, sorted ChromVariants containing only the selected variants.
    """
    subset = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)
    for position in locset:
        variant = chromvariant.all_variants[position]
        subset._add_variant(variant)
        num_type_dict[variant.var_type] += 1
    subset._ensure_sorted()
    return subset
def evaluate_low_memory_chrom(chrom_name, true_chrom, pred_chrom, known_fp=None):
    """Evaluate a single chromosome and fold the result into the running totals.

    Relies on names that are not parameters (``max_indel_len``, ``eps``,
    ``eps_bp``, ``ref``, ``window``, ``genome_stats``, ``writer``); they must
    be provided by the surrounding scope.

    Args:
        chrom_name: chromosome name used for both variant containers.
        true_chrom: iterable of truth records for this chromosome.
        pred_chrom: iterable of predicted records for this chromosome.
        known_fp: forwarded unchanged to ``chrom_evaluate_variants``.

    Returns:
        None. Results are reported through ``genome_stats`` and, when a
        writer is configured, through ``write_annotated_var``.
    """
    true_variants = ChromVariants(chrom_name, max_indel_len)
    for record in true_chrom:
        true_variants.add_record(record)

    pred_variants = ChromVariants(chrom_name, max_indel_len)
    for record in pred_chrom:
        pred_variants.add_record(record)

    # NOTE: an empty ChromVariants for known false positives used to be
    # built here but was never read; known_fp is passed through as given.
    cvs = chrom_evaluate_variants(true_variants, pred_variants, eps, eps_bp,
                                  ref, window, known_fp)

    # Accumulates into genome_stats as a side effect.
    aggregate_stats(genome_stats, cvs)
    if writer:
        write_annotated_var(writer, cvs)
def _extract(self, chromvariant, locset, num_type_dict):
    """Build a ChromVariants holding only the variants found at ``locset``.

    Args:
        chromvariant: container to copy from; its ``chrom`` and
            ``_max_indel_len`` are reused for the copy.
        locset: iterable of keys into ``chromvariant.all_variants``.
        num_type_dict: counter keyed by variant type, bumped in place for
            each copied variant.

    Returns:
        The new ChromVariants, after ``_ensure_sorted()`` has been called.
    """
    picked = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)
    for key in locset:
        entry = chromvariant.all_variants[key]
        picked._add_variant(entry)
        num_type_dict[entry.var_type] += 1
    picked._ensure_sorted()
    return picked
def evaluate_low_memory_chrom(chrom_name, true_chrom, pred_chrom, known_fp=None):
    """Load one chromosome's records, evaluate them, and aggregate the stats.

    Several names used here (``max_indel_len``, ``eps``, ``eps_bp``, ``ref``,
    ``window``, ``genome_stats``, ``writer``) are not parameters and must be
    supplied by the surrounding scope.

    Args:
        chrom_name: chromosome name given to both variant containers.
        true_chrom: iterable of truth records to add.
        pred_chrom: iterable of predicted records to add.
        known_fp: handed to ``chrom_evaluate_variants`` as-is.

    Returns:
        None. Output goes to ``genome_stats`` and, if ``writer`` is truthy,
        to ``write_annotated_var``.
    """
    true_variants = ChromVariants(chrom_name, max_indel_len)
    for rec in true_chrom:
        true_variants.add_record(rec)

    pred_variants = ChromVariants(chrom_name, max_indel_len)
    for rec in pred_chrom:
        pred_variants.add_record(rec)

    # NOTE: the unused, always-empty known-FP ChromVariants that used to be
    # allocated here has been dropped; known_fp is forwarded untouched.
    cvs = chrom_evaluate_variants(true_variants, pred_variants, eps, eps_bp,
                                  ref, window, known_fp)

    # Accumulates into genome_stats as a side effect.
    aggregate_stats(genome_stats, cvs)
    if writer:
        write_annotated_var(writer, cvs)
def __init__(self, true_var, pred_var, true_positives, false_positives,
             false_negatives, concordance):
    """Initialise counts and TP/FP/FN variant subsets for a contig.

    Args:
        true_var: truth variants container (``chrom``, ``var_num``,
            ``all_variants`` are used).
        pred_var: predicted variants container (same API).
        true_positives: keys looked up in ``true_var.all_variants``.
        false_positives: keys looked up in ``pred_var.all_variants``.
        false_negatives: keys looked up in ``true_var.all_variants``.
        concordance: kept as ``genotype_concordance`` without modification.
    """
    self.chrom = true_var.chrom
    self.true_var = true_var
    self.pred_var = pred_var

    # Per-type totals taken from the two input containers.
    self.num_true = {kind: true_var.var_num(kind) for kind in VARIANT_TYPE}
    self.num_pred = {kind: pred_var.var_num(kind) for kind in VARIANT_TYPE}

    # Per-type integer counters, updated by the _extract calls below.
    self.num_tp = _type_dict()
    self.num_fp = _type_dict()
    self.num_fn = _type_dict()

    # Subsets are stored as ChromVariants objects.
    self.false_positives = self._extract(
        pred_var, false_positives, self.num_fp)
    self.false_negatives = self._extract(
        true_var, false_negatives, self.num_fn)
    self.true_positives = self._extract(
        true_var, true_positives, self.num_tp)

    # Empty container to be populated with rescued variants.
    self.rescued_vars = ChromVariants(
        self.chrom, self.false_positives._max_indel_len)

    # Placeholders; all four are set externally after construction.
    self.intersect_bad = None
    self.known_fp = None
    self.calls_at_known_fp = None
    self.known_fp_variants = None

    self.genotype_concordance = concordance
class ChromVariantStats:
    """Stats for a certain contig's worth of variants."""

    def __init__(self, true_var, pred_var, true_positives, false_positives,
                 false_negatives, concordance):
        """Set up per-type counters and TP/FP/FN variant subsets.

        Args:
            true_var: truth variants container; ``chrom``, ``var_num`` and
                ``all_variants`` are read from it.
            pred_var: predicted variants container (same API).
            true_positives: keys into ``true_var.all_variants``.
            false_positives: keys into ``pred_var.all_variants``.
            false_negatives: keys into ``true_var.all_variants``.
            concordance: stored as ``genotype_concordance``; indexed as
                ``[var_type][genotype][genotype]`` by ``_nrd_counts``.
        """
        self.chrom = true_var.chrom
        self.true_var = true_var
        self.pred_var = pred_var
        self.num_true = {t: true_var.var_num(t) for t in VARIANT_TYPE}
        self.num_pred = {t: pred_var.var_num(t) for t in VARIANT_TYPE}

        # Integer counts per variant type, filled in by _extract below.
        self.num_tp = _type_dict()
        self.num_fp = _type_dict()
        self.num_fn = _type_dict()

        # Each subset is a ChromVariants of its own.
        self.false_positives = self._extract(
            pred_var, false_positives, self.num_fp)
        self.false_negatives = self._extract(
            true_var, false_negatives, self.num_fn)
        self.true_positives = self._extract(
            true_var, true_positives, self.num_tp)

        # Populated with rescued variants by rectify().
        self.rescued_vars = ChromVariants(
            self.chrom, self.false_positives._max_indel_len)

        # The following are set externally after construction.
        self.intersect_bad = None
        self.known_fp = None
        self.calls_at_known_fp = None
        self.known_fp_variants = None

        self.genotype_concordance = concordance

    def _extract(self, chromvariant, locset, num_type_dict):
        """Copy the variants at ``locset`` into a new, sorted ChromVariants.

        ``num_type_dict`` is incremented in place, once per copied variant,
        under that variant's ``var_type``.
        """
        clone = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)
        for loc in locset:
            var = chromvariant.all_variants[loc]
            clone._add_variant(var)
            num_type_dict[var.var_type] += 1
        clone._ensure_sorted()
        return clone

    def rectify(self, ref, window):
        """Rescue variants from VCF ambiguity.

        Given reference genome and window of sequence comparison, fix each
        error estimated to be an artifact of VCF ambiguity. Calling this
        function strictly improves realism of evaluation, but doesn't
        perfectly detect ambiguity.

        TODO: Don't do redundant calculation for overlapping windows
        TODO: This should return two bools: events match, and genotypes match

        Note: Here the window is a single integer for the size. It gets
        converted specifically to an interval further down, and that's its
        subsequent usage in the stack.
        """
        # Snapshot the locations first: rescue_mission modifies the
        # false-negative set while we iterate.
        locs_to_rescue = list(self.false_negatives.all_locations)
        for loc in locs_to_rescue:
            if loc not in self.false_negatives.all_variants:
                # Already removed by an earlier rescue.
                continue
            new_tp, rm_fp, rescued_vars = rescue_mission(
                self.false_negatives, self.false_positives,
                self.true_positives, loc, ref, window)
            for t in VARIANT_TYPE:
                # Seemingly odd accounting. The number of predicted variants
                # *changes* as a result of rescuing.
                # e.g. 2 predicted FPs are in fact 1 FN. So
                #  -- remove 2 predicted variants
                #  -- remove 2 false positives
                #  -- remove 1 false negative
                #  -- add 1 true positive
                self.num_pred[t] -= rm_fp[t]
                self.num_fp[t] -= rm_fp[t]
                self.num_pred[t] += new_tp[t]
                self.num_fn[t] -= new_tp[t]
                self.num_tp[t] += new_tp[t]
            for v in rescued_vars:
                self.rescued_vars._add_variant(v)

    def _nrd_counts(self, var_type):
        """Return ``(n_wrong, n_total)`` genotype counts for ``var_type``.

        ``n_wrong`` sums every off-diagonal cell of the HOM_REF/HET/HOM_VAR
        concordance table. ``n_total`` adds the HET/HET and HOM_VAR/HOM_VAR
        cells to that; the HOM_REF/HOM_REF cell is not counted.
        """
        counts = self.genotype_concordance[var_type]
        genotypes = (GENOTYPE_TYPE.HOM_REF, GENOTYPE_TYPE.HET,
                     GENOTYPE_TYPE.HOM_VAR)
        n_wrong = sum(counts[a][b]
                      for a in genotypes for b in genotypes if a != b)
        n_total = (n_wrong
                   + counts[GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET]
                   + counts[GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR])
        return (n_wrong, n_total)

    def to_dict(self, var_type):
        """Return the summary statistics for ``var_type`` as a plain dict.

        ``intersect_bad`` must have been set before calling this. The two
        known-false-positive entries fall back to 0 when the corresponding
        attributes have not been set.
        """
        stats = {}
        stats['num_true'] = self.num_true[var_type]
        stats['num_pred'] = self.num_pred[var_type]
        stats['false_positives'] = self.num_fp[var_type]
        stats['false_negatives'] = self.num_fn[var_type]
        stats['good_predictions'] = self.num_tp[var_type]
        stats['intersect_bad'] = len(self.intersect_bad[var_type])
        nrd_wrong, nrd_total = self._nrd_counts(var_type)
        stats['nrd_wrong'] = nrd_wrong
        stats['nrd_total'] = nrd_total
        has_calls = bool(self.calls_at_known_fp)
        stats['known_fp_calls'] = (
            self.calls_at_known_fp[var_type] if has_calls else 0)
        # Also check known_fp itself: it is assigned separately from
        # calls_at_known_fp, and indexing None would raise TypeError.
        stats['known_fp'] = (
            self.known_fp[var_type]
            if has_calls and self.known_fp is not None else 0)
        return stats
def _virgin_chrom(self, chrom):
    """Create a fresh, empty ChromVariants for ``chrom``.

    The container is built with this object's ``_max_indel_len`` and the
    keyword arguments stored in ``self._args``.
    """
    empty_variants = ChromVariants(chrom, self._max_indel_len, **self._args)
    return empty_variants
class ChromVariantStats:
    """Stats for a certain contig's worth of variants."""

    def __init__(self, true_var, pred_var, true_positives, false_positives,
                 false_negatives, concordance):
        """Initialise per-type counts and the TP/FP/FN variant subsets.

        Args:
            true_var: truth variants container; ``chrom``, ``var_num`` and
                ``all_variants`` are read from it.
            pred_var: predicted variants container (same API).
            true_positives: keys into ``true_var.all_variants``.
            false_positives: keys into ``pred_var.all_variants``.
            false_negatives: keys into ``true_var.all_variants``.
            concordance: stored as ``genotype_concordance``; indexed as
                ``[var_type][genotype][genotype]`` by ``_nrd_counts``.
        """
        self.chrom = true_var.chrom
        self.true_var = true_var
        self.pred_var = pred_var
        self.num_true = {t: true_var.var_num(t) for t in VARIANT_TYPE}
        self.num_pred = {t: pred_var.var_num(t) for t in VARIANT_TYPE}

        # Integer counts per variant type, filled in by _extract below.
        self.num_tp = _type_dict()
        self.num_fp = _type_dict()
        self.num_fn = _type_dict()

        # Each subset is a ChromVariants of its own.
        self.false_positives = self._extract(
            pred_var, false_positives, self.num_fp)
        self.false_negatives = self._extract(
            true_var, false_negatives, self.num_fn)
        self.true_positives = self._extract(
            true_var, true_positives, self.num_tp)

        # Populated with rescued variants by rectify().
        self.rescued_vars = ChromVariants(
            self.chrom, self.false_positives._max_indel_len)

        # The following are set externally after construction.
        self.intersect_bad = None
        self.known_fp = None
        self.calls_at_known_fp = None
        self.known_fp_variants = None

        self.genotype_concordance = concordance

    def _extract(self, chromvariant, locset, num_type_dict):
        """Copy the variants at ``locset`` into a new, sorted ChromVariants.

        ``num_type_dict`` is incremented in place, once per copied variant,
        under that variant's ``var_type``.
        """
        clone = ChromVariants(chromvariant.chrom, chromvariant._max_indel_len)
        for loc in locset:
            var = chromvariant.all_variants[loc]
            clone._add_variant(var)
            num_type_dict[var.var_type] += 1
        clone._ensure_sorted()
        return clone

    def rectify(self, ref, window):
        """Rescue variants from VCF ambiguity.

        Given reference genome and window of sequence comparison, fix each
        error estimated to be an artifact of VCF ambiguity. Calling this
        function strictly improves realism of evaluation, but doesn't
        perfectly detect ambiguity.

        TODO: Don't do redundant calculation for overlapping windows
        TODO: This should return two bools: events match, and genotypes match

        Note: Here the window is a single integer for the size. It gets
        converted specifically to an interval further down, and that's its
        subsequent usage in the stack.
        """
        # Snapshot the locations first: rescue_mission modifies the
        # false-negative set while we iterate.
        locs_to_rescue = list(self.false_negatives.all_locations)
        for loc in locs_to_rescue:
            if loc not in self.false_negatives.all_variants:
                # Already removed by an earlier rescue.
                continue
            new_tp, rm_fp, rescued_vars = rescue_mission(
                self.false_negatives, self.false_positives,
                self.true_positives, loc, ref, window)
            for t in VARIANT_TYPE:
                # Seemingly odd accounting. The number of predicted variants
                # *changes* as a result of rescuing.
                # e.g. 2 predicted FPs are in fact 1 FN. So
                #  -- remove 2 predicted variants
                #  -- remove 2 false positives
                #  -- remove 1 false negative
                #  -- add 1 true positive
                self.num_pred[t] -= rm_fp[t]
                self.num_fp[t] -= rm_fp[t]
                self.num_pred[t] += new_tp[t]
                self.num_fn[t] -= new_tp[t]
                self.num_tp[t] += new_tp[t]
            for v in rescued_vars:
                self.rescued_vars._add_variant(v)

    def _nrd_counts(self, var_type):
        """Return ``(n_wrong, n_total)`` genotype counts for ``var_type``.

        ``n_wrong`` sums every off-diagonal cell of the HOM_REF/HET/HOM_VAR
        concordance table. ``n_total`` adds the HET/HET and HOM_VAR/HOM_VAR
        cells to that; the HOM_REF/HOM_REF cell is not counted.
        """
        counts = self.genotype_concordance[var_type]
        genotypes = (GENOTYPE_TYPE.HOM_REF, GENOTYPE_TYPE.HET,
                     GENOTYPE_TYPE.HOM_VAR)
        n_wrong = sum(counts[a][b]
                      for a in genotypes for b in genotypes if a != b)
        n_total = (n_wrong
                   + counts[GENOTYPE_TYPE.HET][GENOTYPE_TYPE.HET]
                   + counts[GENOTYPE_TYPE.HOM_VAR][GENOTYPE_TYPE.HOM_VAR])
        return (n_wrong, n_total)

    def to_dict(self, var_type):
        """Return the summary statistics for ``var_type`` as a plain dict.

        ``intersect_bad`` must have been set before calling this. The two
        known-false-positive entries fall back to 0 when the corresponding
        attributes have not been set.
        """
        stats = {}
        stats['num_true'] = self.num_true[var_type]
        stats['num_pred'] = self.num_pred[var_type]
        stats['false_positives'] = self.num_fp[var_type]
        stats['false_negatives'] = self.num_fn[var_type]
        stats['good_predictions'] = self.num_tp[var_type]
        stats['intersect_bad'] = len(self.intersect_bad[var_type])
        nrd_wrong, nrd_total = self._nrd_counts(var_type)
        stats['nrd_wrong'] = nrd_wrong
        stats['nrd_total'] = nrd_total
        has_calls = bool(self.calls_at_known_fp)
        stats['known_fp_calls'] = (
            self.calls_at_known_fp[var_type] if has_calls else 0)
        # Also check known_fp itself: it is assigned separately from
        # calls_at_known_fp, and indexing None would raise TypeError.
        stats['known_fp'] = (
            self.known_fp[var_type]
            if has_calls and self.known_fp is not None else 0)
        return stats