def update(self, condition, symbol, symbol_set):
    """Increase the stored likelihood of *symbol* under *condition*.

    Loads the full conditional distribution and the sub-distribution
    restricted to *symbol_set*, bumps *symbol* just above its strongest
    rival within that set, renormalises the sub-distribution, and folds
    it back into the whole distribution preserving the probability mass
    the candidate set held originally.

    Fix: the original computed ``max()`` over rival probabilities with a
    generator that is empty whenever *symbol* is the only member of the
    sub-distribution, raising ``ValueError``; we now return early in
    that case (there is no rival to boost the symbol above).
    """
    query = self.density.filter(condition=condition)
    whole_dist = ProbDist.from_query_set(query)
    sub_dist = ProbDist.from_query_set(query.filter(symbol__in=symbol_set))
    assert sub_dist

    # Probabilities of the competing symbols in the candidate set.
    rivals = [v for (s, v) in sub_dist.iteritems() if s != symbol]
    if not rivals:
        # Symbol is the only candidate; it already holds all the mass.
        return
    m = max(rivals) + settings.UPDATE_EPSILON
    if sub_dist[symbol] >= m:
        # Already ahead of every rival -- nothing to say here.
        return

    # Increase the likelihood of seeing the symbol, then renormalise so
    # the candidate set remains a valid distribution.
    sub_dist[symbol] = m
    sub_dist.normalise()

    # Rescale the candidates back into the whole distribution so the
    # candidate set keeps the same total mass it had before the update.
    sub_dist_mass = sum(map(whole_dist.__getitem__, sub_dist.keys()))
    for s in sub_dist:
        whole_dist[s] = sub_dist[s] * sub_dist_mass
    assert abs(sum(whole_dist.values()) - 1.0) < 1e-6
    whole_dist.save_to(self.density, condition=condition)
    return
def sample_seq_n(self, condition_segments, n, exclude_set=None):
    """Sample n symbol sequences for a segmented condition.

    Each kanji segment is swapped for its conditional reading
    distribution; every other segment passes through unchanged. The
    resulting sequence distribution is then sampled n times.
    """
    kanji = scripts.Script.Kanji
    components = []
    for segment in condition_segments:
        if scripts.script_type(segment) != kanji:
            # Non-kanji segments have a fixed reading.
            components.append(segment)
            continue
        conditional = self.density.filter(condition=segment)
        components.append(ProbDist.from_query_set(conditional))
    return SeqDist(*components).sample_n(n, exclude_set)
def sample_seq_n(self, condition_segments, n, exclude_set=None):
    """Draw n samples from the sequence distribution over the segments.

    Kanji segments contribute their conditional reading distributions;
    all other segments are used verbatim.
    """
    kanji_type = scripts.Script.Kanji
    parts = [
        ProbDist.from_query_set(self.density.filter(condition=seg))
        if scripts.script_type(seg) == kanji_type
        else seg
        for seg in condition_segments
    ]
    return SeqDist(*parts).sample_n(n, exclude_set)
def update(self, condition, symbol, symbol_set):
    """Raise the stored likelihood of *symbol* under *condition*.

    Restricts the conditional distribution to *symbol_set*, lifts
    *symbol* slightly above the best competing candidate, renormalises,
    and writes the adjusted probabilities back into the full
    distribution without changing the candidate set's total mass.

    Fix: ``max()`` was taken over a generator of rival probabilities
    which is empty when *symbol* is the sole candidate, raising
    ``ValueError`` (``assert sub_dist`` does not guard this). We return
    early instead, since a lone candidate needs no boosting.
    """
    query = self.density.filter(condition=condition)
    whole_dist = ProbDist.from_query_set(query)
    sub_dist = ProbDist.from_query_set(query.filter(symbol__in=symbol_set))
    assert sub_dist

    # Likelihoods of the other candidates in the restricted set.
    rival_values = [v for (s, v) in sub_dist.iteritems() if s != symbol]
    if not rival_values:
        # No rivals: the symbol already dominates its candidate set.
        return
    m = max(rival_values) + settings.UPDATE_EPSILON
    if sub_dist[symbol] >= m:
        # Nothing to say here.
        return

    # Increase the likelihood of seeing the symbol and renormalise the
    # candidate sub-distribution.
    sub_dist[symbol] = m
    sub_dist.normalise()

    # Distribute the candidates' original probability mass across the
    # renormalised sub-distribution inside the whole distribution.
    sub_dist_mass = sum(map(whole_dist.__getitem__, sub_dist.keys()))
    for s in sub_dist:
        whole_dist[s] = sub_dist[s] * sub_dist_mass
    assert abs(sum(whole_dist.values()) - 1.0) < 1e-6
    whole_dist.save_to(self.density, condition=condition)
    return
def _pad_readings(self, prior_dist):
    """
    Once the reading distribution has been copied over, we still have
    the problem that there may not be enough erroneous readings to meet
    the minimum number of distractors we wish to generate. To
    circumvent this problem, we pad with random distractors.
    """
    _log.log('Padding results ', newLine=False)
    # Distinct conditions (kanji) present in the prior distribution.
    conditions = set(o['condition'] for o in \
            prior_dist.density.all().values('condition'))
    # NOTE(review): each element is unpacked as a 1-tuple -- presumably
    # each condition is a single-character kanji string; confirm against
    # what withProgress() yields.
    for (condition,) in consoleLog.withProgress(conditions):
        # Genuine readings of this kanji must never be used as
        # distractors, so they seed the exclusion set.
        exclude_set = set(
                o.reading for o in \
                lexicon_models.KanjiReading.objects.filter(
                    kanji__kanji=condition)
            )
        # How many erroneous (non-genuine) readings are already stored.
        n_stored = prior_dist.density.filter(condition=condition).exclude(
                symbol__in=exclude_set).count()
        sub_dist = ProbDist.from_query_set(prior_dist.density.filter(
                condition=condition))
        # Also exclude symbols the distribution already contains.
        exclude_set.update(sub_dist.keys())
        n_needed = settings.MIN_TOTAL_DISTRACTORS - n_stored
        # Padding symbols get half the smallest existing probability so
        # they stay unlikely relative to real data.
        min_prob = min(sub_dist.itervalues()) / 2
        # NOTE(review): if sample_n() keeps returning only excluded
        # symbols this loop never terminates -- verify sample_n's
        # coverage of the symbol space.
        while n_needed > 0:
            for row in lexicon_models.KanjiReadingProb.sample_n(n_needed):
                if row.symbol not in exclude_set:
                    sub_dist[row.symbol] = min_prob
                    exclude_set.add(row.symbol)
                    n_needed -= 1
                    if n_needed == 0:
                        break
        # Renormalise after padding and persist the result.
        sub_dist.normalise()
        sub_dist.save_to(prior_dist.density, condition=condition)
    return
def sample_n(self, condition, n, exclude_set=None):
    "Samples n symbols without replacement from the distribution."
    conditioned = self.density.filter(condition=condition)
    dist = ProbDist.from_query_set(conditioned)
    return dist.sample_n(n, exclude_set=exclude_set)
def sample_n(self, condition, n, exclude_set=None):
    "Samples n symbols without replacement from the distribution."
    rows = self.density.filter(condition=condition)
    return ProbDist.from_query_set(rows).sample_n(
            n, exclude_set=exclude_set)