def prob(self, grapheme, reading, alt_reading):
    """
    Returns the probability of P(r|k), using the formula:
    P(r|k) ~ (alpha)P_raw(r|k) + (1-alpha)P(r|r*)P(r*|k).
    """
    if scripts.to_hiragana(grapheme) == scripts.to_hiragana(alt_reading):
        # Special case: where the segment is phonetic.
        return 1.0

    # We only handle entire kanji segments.
    assert scripts.script_types(grapheme) == set([scripts.Script.Kanji])

    alpha = settings.ALTERNATION_ALPHA
    assert 0 <= alpha <= 1

    try:
        raw_prob = self.raw_freq_dist[grapheme].freq(alt_reading)
    except KeyError:
        raw_prob = 0.0

    normalized_prob = self.normalized_freq_dist[grapheme].freq(reading)
    alternation_prob = self.alternation_dist[reading].freq(alt_reading)

    return alpha * raw_prob + (1 - alpha) * normalized_prob * alternation_prob
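# For intuition, a minimal numeric sketch of the interpolation above.  The
# value of alpha and the three probabilities are illustrative only, not taken
# from any trained model.
alpha = 0.7
raw_prob = 0.10          # P_raw(r|k): the alternative reading observed directly for the kanji
normalized_prob = 0.40   # P(r*|k): the canonical reading given the kanji
alternation_prob = 0.50  # P(r|r*): the sound change from canonical to alternative reading
blended = alpha * raw_prob + (1 - alpha) * normalized_prob * alternation_prob
assert abs(blended - 0.13) < 1e-9    # 0.7*0.10 + 0.3*0.40*0.50 = 0.07 + 0.06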
def _get_kanji_readings(alignments):
    """
    Develop a set of readings for each kanji which a learner must know as
    part of this syllabus. This set may contain invalid readings, and will
    later be pruned to only valid readings.
    """
    kanji_script = scripts.Script.Kanji
    readings = {}
    for alignment in alignments:
        alignment_len = len(alignment)
        for i, (g_seg, p_seg) in enumerate(zip(alignment.g_segs,
                                               alignment.p_segs)):
            # Only single-kanji grapheme segments are of interest.
            if len(g_seg) > 1 or scripts.script_types(g_seg) != set([kanji_script]):
                continue
            reading_set = readings.setdefault(g_seg, set())
            reading_set.add(p_seg)

            # Also include canonical variants of the reading, which depend on
            # whether the segment has neighbours on either side.
            has_left_context = i > 0
            has_right_context = i < alignment_len - 1
            extra_variants = alternations.canonicalSegmentForms(
                p_seg,
                leftContext=has_left_context,
                rightContext=has_right_context,
            )
            reading_set.update(extra_variants)

    return readings
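# For reference, a sketch of the kind of mapping _get_kanji_readings() builds.
# The entries below are illustrative only, not output from a real syllabus:
# each single-kanji grapheme segment maps to the phonetic segments aligned
# with it, plus canonical sound-change variants of those readings.
example_readings = {
    u'学': set([u'がく', u'がっ']),   # e.g. from 学生 (がくせい) and 学校 (がっこう)
    u'生': set([u'せい']),            # e.g. from 学生 (がくせい)
}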
def _format_alignment(alignment):
    result = []
    for g_seg, p_seg in zip(alignment.g_segs, alignment.p_segs):
        if scripts.script_types(g_seg) == set([scripts.Script.Kanji]):
            # Keep the reading of a kanji segment as a single unit.
            result.append(p_seg)
        else:
            # Split non-kanji segments into individual characters.
            result.extend(p_seg)
    return '|'.join(result)
def addItem(self):
    if unicode(self.input.text()).strip() != '':
        # Ignore any input which contains ASCII characters.
        if scripts.Script.Ascii not in scripts.script_types(self.input.text()):
            if self.input.text() not in self.user_list:
                self.user_list.append(self.input.text())
                self.appendToList(self.input.text())
        if self.enter.isChecked():
            self.input.clear()
def get_accuracy_by_pivot_type():
    cursor = connection.cursor()
    cursor.execute("""
        SELECT question.pivot, SUM(chosen_option.is_correct) as n_correct,
            COUNT(*) as n_responses
        FROM (
            SELECT mco.question_id, mco.is_correct
            FROM drill_multiplechoiceresponse AS mcr
            INNER JOIN drill_multiplechoiceoption AS mco
            ON mcr.option_id = mco.id
        ) as chosen_option
        INNER JOIN drill_question AS question
        ON chosen_option.question_id = question.id
        WHERE question.pivot_type = "w"
        GROUP BY question.pivot
    """)
    raw_data = cursor.fetchall()

    counts = {
        'Hiragana': FreqDist(),
        'Katakana': FreqDist(),
        'Kanji': FreqDist(),
    }
    complex_scripts = set([scripts.Script.Kanji, scripts.Script.Unknown])
    only_katakana = set([scripts.Script.Katakana])

    for word, n_correct, n_responses in raw_data:
        scripts_found = scripts.script_types(word)
        if scripts_found.intersection(complex_scripts):
            dist = counts['Kanji']
        elif scripts_found.intersection(only_katakana):
            dist = counts['Katakana']
        else:
            dist = counts['Hiragana']
        dist.inc(True, int(n_correct))
        dist.inc(False, int(n_responses - n_correct))

    keys = ('Hiragana', 'Katakana', 'Kanji')
    data = [(key, counts[key].freq(True)) for key in keys]

    average = FreqDist()
    for key in keys:
        average.inc(True, counts[key][True])
        average.inc(False, counts[key][False])
    data.append(('Average', average.freq(True)))

    return data
def lookup(self, query):
    found = None

    if self.config.ignore_kana():
        # Skip queries which consist entirely of hiragana or katakana.
        if len(scripts.script_types(query)) == 1:
            if scripts.script_type(query) in (scripts.Script.Hiragana,
                                              scripts.Script.Katakana):
                return found

    try:
        found = self.edict[query]
        if self.config.ignore_duplicates():
            if found.word in self.stats:
                found = None
            else:
                self.stats.append(found.word)
    except KeyError:
        if query not in self.missed:
            self.missed.append(query)

    return found
def update(self, response):
    "Update our error model from a user's response."
    error_dist = models.ErrorDist.objects.get(user=response.user,
                                              tag=self.dist_name)
    question = response.question
    base_segs = question.annotation.split(u'|')
    response_segs = response.option.annotation.split(u'|')

    # For each segment position, collect the readings offered by the
    # distractor options (every option other than the one chosen).
    distractor_sets = map(set, zip(*[
        o['annotation'].split('|')
        for o in question.multiplechoicequestion.options.values('annotation')
        if o['annotation'] != response.option.annotation
    ]))
    assert len(base_segs) == len(response_segs) == len(distractor_sets)

    for base_seg, response_seg, distractor_segs in izip(base_segs,
                                                        response_segs,
                                                        distractor_sets):
        # Only kanji segments carry an error distribution.
        if scripts.script_types(base_seg) != set([scripts.Script.Kanji]):
            continue

        sub_dist = models.ProbDist.from_query_set(
            error_dist.density.filter(condition=base_seg))
        e = settings.UPDATE_EPSILON
        try:
            m = max(imap(sub_dist.__getitem__, distractor_segs)) + e
            existing_score = sub_dist[response_seg]
        except KeyError:
            raise UpdateError(
                u'for user %s, dist %s, response %d: no entry for %s|%s' % (
                    response.user.username,
                    self.dist_name,
                    response.id,
                    response_seg,
                    base_seg,
                ))
        if m > existing_score:
            sub_dist[response_seg] = m
            sub_dist.normalise()
            sub_dist.save_to(error_dist.density, condition=base_seg)

    return
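# Tiny numeric sketch of the update rule above, using plain dicts in place of
# the ProbDist model; the readings, probabilities and epsilon are illustrative
# only.  The chosen reading is bumped above the best distractor, then the
# distribution is renormalised.
sub_dist = {u'がく': 0.6, u'かく': 0.25, u'まな': 0.15}   # P(reading | kanji) for one user
distractors = [u'がく', u'まな']                           # readings offered but not chosen
chosen = u'かく'
epsilon = 0.05

m = max(sub_dist[r] for r in distractors) + epsilon        # 0.6 + 0.05 = 0.65
if m > sub_dist[chosen]:
    sub_dist[chosen] = m
    total = sum(sub_dist.values())
    sub_dist = dict((k, v / total) for k, v in sub_dist.items())   # renormalise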
def _jpn(token):
    """Convert jpn token to phonemes."""
    import sys
    from cjktools import scripts
    from cjktools.resources import kanjidic

    # Build a lookup from kana to candidate phoneme strings.
    lkp = {}
    for fn in ["lib/data/phon/ja-Hira", "lib/data/phon/ja-Kata"]:
        lines = open(fn).readlines()
        if len(lines) == 0:
            continue
        for line in lines:
            if line.strip() == "":
                continue
            kv = line.strip().split("\t")
            if len(kv) != 2:
                print("!", kv, file=sys.stderr)
                continue
            k = kv[0].strip()
            v = kv[1].strip()
            if k not in lkp:
                lkp[k] = []
            lkp[k].append(v)

    kjd = kanjidic.Kanjidic(kanjidic_files=["lib/data/dict/ja"])

    # Replace kanji with their first on reading; pass other segments through.
    op = ""
    segs = scripts.script_boundaries(token)
    for seg in segs:
        tipus = scripts.script_types(seg)
        if 3 in tipus:
            # Segment contains kanji (script type 3 here).
            for ch in seg:
                if ch in kjd:
                    if len(kjd[ch].on_readings) > 0:
                        op += kjd[ch].on_readings[0]
        else:
            op += seg

    res = _maxphon(lkp, op)
    if res == "":
        return "?"
    return res
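# Minimal sketch of the kanjidic lookup that the kanji branch above relies on.
# It assumes the cjktools kanjidic data is installed where the library can
# find it (using the default constructor rather than the explicit path above).
from cjktools.resources import kanjidic

kjd = kanjidic.Kanjidic()
entry = kjd[u'国']
print(entry.on_readings[0])   # e.g. コク; the exact reading comes from the dictionary data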
def process_response(self, request, response):
    if response.status_code != 200:
        return response
    if not response.get('Content-Type', '').startswith('text/html'):
        return response

    content = response.content.decode('utf8')
    if not scripts.script_types(content).intersection(self.japanese_scripts):
        # No Japanese text in the page; leave it untouched.
        return response

    # Wrap each run of Japanese script in a language-tagged span.
    parts = []
    for part in scripts.script_boundaries(content):
        if scripts.script_type(part) in self.japanese_scripts:
            parts.append('<span lang="ja" xml:lang="ja">%s</span>' % part)
        else:
            parts.append(part)

    response.content = u''.join(parts).encode('utf8')
    return response
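# Quick sketch of the two cjktools calls the middleware above depends on; the
# outputs shown in the comments are indicative, with the exact segmentation
# left to cjktools.
from cjktools import scripts

text = u'Python3で日本語のテキストを処理する'
print(scripts.script_boundaries(text))
# -> runs of a single script, e.g. [u'Python3', u'で', u'日本語', u'の', ...]
print(scripts.script_types(text))
# -> the set of Script values present (Ascii, Hiragana, Katakana and Kanji here)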
def check_scripts(plain):
    """Return True if the text contains any ASCII characters."""
    return scripts.Script.Ascii in scripts.script_types(plain)
def _get_stimulus_class(self, stimulus):
    # Purely ASCII stimuli get the roman style; anything else is treated as CJK.
    if scripts.script_types(stimulus) == set([scripts.Script.Ascii]):
        return 'stimulus_roman'
    else:
        return 'stimulus_cjk'